c - CUDA code not processing if block properly -
stuck @ if block right below //step 5, issue code not progress or after given if block. need figure out how particular issue settled before starting task of generating parallel code. if run code see 1 print statement indicates value of "one" , 2 "i" , "j". after if block begins, none of other print statements hit. result quite stuck, aware specific issue, however, cannot seem determine it's cause.
any appreciated! in advance!
input file sample.
>386.fasta.screen.contig1 gagtttgatcctggctcagaatcaacgctggcggcgcgcttaacacatgc aagtcgaacgagaaagtggagcaatccatgagtacagtggcgtacgggtg agtaacacgtgggtaatctacctcttagtggggaataactttgggaaacc gaagctaataccgcataagctcgagagaggaaagcagcaatgcgctgaga gaggagcccgcggccgattagctagttggcagggtaaaagcctaccaagg cagagatcggtagccggcctgagagggcacacggccacactggcactgaa acacgggccagactcctacgggaggcagcagtggggaatcttgcacaatg ggggcaaccctgatgcagcgacgccgcgtgagcgatgaagcccttcgggg tgtaaagctctttcgtcagggaagatagtgacggtacctggagaagcagc tgcggctaactacgtgccagcagccgcggtaatacgtaggcagcgagcgt tgttcggagttactgggcgtaaagggtgtgtaggcggttgtttaagtttg gtgtgaaatctcccggctcaactgggagggtgcgccgaatactgagcgac tagagtgcgggagaggaaagtggaattcctggtgtagcggtgaaatgcgt agatatcaggaggaacaccggtggtgtagacggctttctggaccgtaact gacgctgagacacgaaagcgtgggtagcaaacaggattagataccctggt agtccacgccctaaacgatgcatatttggtgtgggcagttcattctgtcc gtgccggagctaacgcgttaaatatgccgcctggggagtacagtcgcaag gctgaaactcaaaggaattgacgggggcccgcacaagcggtggagcatgt ggtttaattcgacgcaacgcgaagaaccttacctgggctcgaacggcttc ccaacgccggtagaaatatcggtaccccgcaagggggtggaatcgaggtg ctgcatggctgtcgtcagctcgtgtcgtgagatgttgggttaagtcccgc aacgagcgcaacccttgtcctgtgttgccatgccgcaaggcggcactcgc aggagaccgccagcgataagctggaggaaggtggggatgacgtcaagtcc tcatggcctttatgtccagggctacacacgtgctacaatggccggtacaa agcgtcgctaacctgcgaaggggagccaatcgcaaaaaaccggtctcagt tcggattgcaggctgcaacccgcctgcatgaagctggaatcgctagtaat ggcagatcagcacgctgccgtgaatacgttcccgggccttgtacacacat /******************************** based on code by: lorenzo seidenari (sixmoney@virgilio.it) *********************************/ #include <stdlib.h> #include <string.h> #include <stdio.h> #include <ctype.h> #define max_sequence_length 100000 int n; int m; int levenshtein_distance(char *s,char*t); int minimum(int a,int b,int c); //----------------------------------------------------------------------------- void cleanstring(char string[]) { //removes spaces string pointed "string", converts characters //to uppercase, , deletes terminating newline character. int i, current; int length = strlen(string); current = 0; for(i=0;i<length;i++) { if(string[i]=='\n') { string[current++] = '\0'; break; } else if(string[i]!=' ') { string[current++] = toupper(string[i]); } } } //----------------------------------------------------------------------------- int importfasta(char *filename, char *sequence) { //reads file, located @ path specified "filename", containing fasta //sequence. finds first full, complete sequence in file, stores //it in "sequence", , returns length of sequence, or -1 on failure. file *fastafile; char input[256]; int readflag; //set 1 once sequence has been read in int length; //open file if((fastafile = fopen(filename, "r")) == null) { return -1; } sequence[0] = '\0'; //read full first sequence, discarding unnecessary headers readflag=0; length = 0; while(fgets(input,256,fastafile)!=null) { //is header or comment? if(input[0]=='>' || input[0]==';') { if(readflag) break; else continue; } else readflag = 1; cleanstring(input); length += strlen(input); strncat(sequence,input,max_sequence_length-length - 1); } //add terminatng null character, in case sequence[length] = '\0'; fclose(fastafile); return length; } /****************************************/ /*implementation of levenshtein distance*/ /****************************************/ __global__ void levenshtein_distance(char *s,char*t, int one, int two) /*compute levenshtein distance between s , t*/ { //step 1 int k,i,j,cost,*d; int distance = 0; if(one!=0&&two!=0) { d=(int *)malloc((sizeof(int))*(two+1)*(one+1)); two++; one++; //step 2 for(k=0;k<one;k++){ d[k]=k; } for(k=0;k<two;k++){ d[k*one]=k; } //step 3 , 4 for(i=1;i<one;i++){ for(j=1;j<two;j++) { //step 5 printf("%d %d %d\n", one, i, j); if(s[i-1]==t[j-1]){ cost=0; printf("%d %d %d\n", one, i, j); } else{ cost=1; printf("%d %d %d\n", one, i, j); } printf("%d %d %d\n", one, i, j); //step 6 int min = d[(j-1)*one+i]+1; if (d[j*one+i-1]+1 < min) min = d[j*one+i-1]+1; if (d[(j-1)*one+i-1]+cost < min) min = d[(j-1)*one+i-1]+cost; d[j*one+i] = min; } distance=d[one*two-1]; free(d); printf("%d\n", distance); } } else printf ("-1"); } int main(int argc, char *argv[]) { char a[max_sequence_length+1]; char b[max_sequence_length+1]; if(argc < 3) { printf("usage: new_edit_distance <sequence1> <sequence2>\n"); printf("<sequence1>: file containing first sequence, fasta format\n"); printf("<sequence2>: file containing second sequence, fasta format\n"); return exit_failure; } n = importfasta(argv[1],a); m = importfasta(argv[2],b); levenshtein_distance<<<1, 1>>>(a,b, n, m); cudadevicesynchronize(); printf ("%s\n", cudageterrorstring(cudagetlasterror())); return exit_success; }
i now. took straight serial c/c++ code, dropped kernel, intended run kernel single thread, , want proceed there.
the idea plausible, you're missing key fact cuda , gpus: can't directly access host memory.
so when set , b this:
char a[max_sequence_length+1]; char b[max_sequence_length+1]; .... n = importfasta(argv[1],a); m = importfasta(argv[2],b);
those ordinary variables live in host memory. gpu (ordinary cuda) code can't directly access host memory. when pass pointers kernel this:
levenshtein_distance<<<1, 1>>>(a,b, n, m);
the gpu code try , dereference a
, b
pointers , fault (unspecified launch failure).
every cuda program has following basic sequence:
- copy data gpu
- perform computations on gpu
- copy results back
you've tried step 2 without step 1. won't work.
since i'm not able run program since don't have valid input files, i'll make following suggestion. assume know little or nothing cuda. try adding lines this:
n = importfasta(argv[1],a); // no change m = importfasta(argv[2],b); // no change char *d_a, *d_b; // add line cudamalloc(&d_a, max_sequence_length+1); // add line cudamalloc(&d_b, max_sequence_length+1); // add line cudamemcpy(d_a, a, max_sequence_length+1, cudamemcpyhosttodevice); // add cudamemcpy(d_b, b, max_sequence_length+1, cudamemcpyhosttodevice); // add levenshtein_distance<<<1, 1>>>(d_a,d_b, n, m); //modify parameters
n , m don't need handled differently since passing value.
and add proper cuda error checking code.
edit: after further analysis, it's clear sequence not correct:
distance=d[one*two-1]; free(d); printf("%d\n", distance); } }
you freeing d
on every iteration of i
loop. cannot possibly correct. suggest go square 1 , serial code working first, in ordinary serial c code, before dropping cuda kernel way. if move free
statement outside i
loop, kernel runs very long time. advised in-kernel printf
limited in amount of output can generated.
i'm not going debug code further you. serial code working first, figure out way create kernel without massive quantities of printout.
a final comment: said above approach "plausible". means made work, i.e produce same behavior same code executing on host. not mean run fast. not how acceleration out of gpu (running single block of single thread). assume know based on comment "how particular issue settled before starting task of generating parallel code." think disclaimer appropriate anyway.
Many thanks for sharing!
ReplyDeleteCRUD operations in python using mysql
password_verify in PHP
Matplotlib pie chart
Histogram equalization OpenCV
How to display PDF in HTML using javascript
Histogram OpenCV Python