c - CUDA code not processing if block properly -


I am stuck at the if block right below //step 5: the code does not progress into or past that if block. I need to figure out how to get this particular issue settled before starting the task of generating parallel code. If you run the code, you will see a single print statement that shows the values of "one", "i" and "j". After the if block begins, none of the other print statements are hit. As a result I am quite stuck; I am aware of the specific issue, however, I cannot seem to determine its cause.

Any help is appreciated! Thanks in advance!

input file sample.

>386.fasta.screen.contig1 gagtttgatcctggctcagaatcaacgctggcggcgcgcttaacacatgc aagtcgaacgagaaagtggagcaatccatgagtacagtggcgtacgggtg agtaacacgtgggtaatctacctcttagtggggaataactttgggaaacc gaagctaataccgcataagctcgagagaggaaagcagcaatgcgctgaga gaggagcccgcggccgattagctagttggcagggtaaaagcctaccaagg cagagatcggtagccggcctgagagggcacacggccacactggcactgaa acacgggccagactcctacgggaggcagcagtggggaatcttgcacaatg ggggcaaccctgatgcagcgacgccgcgtgagcgatgaagcccttcgggg tgtaaagctctttcgtcagggaagatagtgacggtacctggagaagcagc tgcggctaactacgtgccagcagccgcggtaatacgtaggcagcgagcgt tgttcggagttactgggcgtaaagggtgtgtaggcggttgtttaagtttg gtgtgaaatctcccggctcaactgggagggtgcgccgaatactgagcgac tagagtgcgggagaggaaagtggaattcctggtgtagcggtgaaatgcgt agatatcaggaggaacaccggtggtgtagacggctttctggaccgtaact gacgctgagacacgaaagcgtgggtagcaaacaggattagataccctggt agtccacgccctaaacgatgcatatttggtgtgggcagttcattctgtcc gtgccggagctaacgcgttaaatatgccgcctggggagtacagtcgcaag gctgaaactcaaaggaattgacgggggcccgcacaagcggtggagcatgt ggtttaattcgacgcaacgcgaagaaccttacctgggctcgaacggcttc ccaacgccggtagaaatatcggtaccccgcaagggggtggaatcgaggtg ctgcatggctgtcgtcagctcgtgtcgtgagatgttgggttaagtcccgc aacgagcgcaacccttgtcctgtgttgccatgccgcaaggcggcactcgc aggagaccgccagcgataagctggaggaaggtggggatgacgtcaagtcc tcatggcctttatgtccagggctacacacgtgctacaatggccggtacaa agcgtcgctaacctgcgaaggggagccaatcgcaaaaaaccggtctcagt tcggattgcaggctgcaacccgcctgcatgaagctggaatcgctagtaat ggcagatcagcacgctgccgtgaatacgttcccgggccttgtacacacat

/********************************
 Based on code by: Lorenzo Seidenari (sixmoney@virgilio.it)
*********************************/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>

#define max_sequence_length 100000

int n;   // length of the first sequence, set in main()
int m;   // length of the second sequence, set in main()

int levenshtein_distance(char *s, char *t);
int minimum(int a, int b, int c);

//-----------------------------------------------------------------------------
// Removes spaces from the string pointed to by "string", converts its
// characters to uppercase, and cuts the string at the first newline.
// The string is rewritten in place and is always NUL-terminated on return,
// including when the input contains no newline (e.g. the last line of a file).
void cleanstring(char string[])
{
    int i, current;
    int length = strlen(string);

    current = 0;
    for (i = 0; i < length; i++) {
        if (string[i] == '\n') {
            break;
        }
        else if (string[i] != ' ') {
            string[current++] = toupper((unsigned char)string[i]);
        }
    }
    // current <= length, so this index is always inside the original string.
    string[current] = '\0';
}

//-----------------------------------------------------------------------------
// Reads the file located at the path specified by "filename", which contains
// FASTA sequences. Stores the first sequence found in "sequence" and returns
// its length, or -1 if the file cannot be opened.
//
// "sequence" must have room for max_sequence_length + 1 bytes (main() passes
// arrays of exactly that size); longer input is truncated to
// max_sequence_length characters instead of overrunning the buffer.
int importfasta(char *filename, char *sequence)
{
    FILE *fastafile;
    char input[256];
    int readflag;   // set to 1 once a sequence line has been read in
    int length;

    // open the file
    if ((fastafile = fopen(filename, "r")) == NULL) {
        return -1;
    }

    sequence[0] = '\0';

    // read the full first sequence, discarding unnecessary headers
    readflag = 0;
    length = 0;
    while (fgets(input, 256, fastafile) != NULL) {
        // is it a header or a comment?
        if (input[0] == '>' || input[0] == ';') {
            if (readflag) break;    // a second header ends the first sequence
            else continue;
        }
        else readflag = 1;

        cleanstring(input);

        // Append at the known end of the sequence, clamped to the space left.
        int chunk = (int)strlen(input);
        int room = max_sequence_length - length;
        if (chunk > room) chunk = room;
        memcpy(sequence + length, input, chunk);
        length += chunk;
        sequence[length] = '\0';
    }

    fclose(fastafile);
    return length;
}

/****************************************/
/*implementation of levenshtein distance*/
/****************************************/

// Computes the Levenshtein distance between s (length "one") and t (length
// "two") and prints it with in-kernel printf; prints "-1" if either length
// is 0. Launched from main() as <<<1, 1>>>; the body does not use
// threadIdx/blockIdx, so every launched thread would repeat the whole
// computation. s and t are dereferenced inside the kernel.
//
// The logic is reproduced as posted, including the debug printf calls.
__global__ void levenshtein_distance(char *s, char *t, int one, int two)
/*compute levenshtein distance between s and t*/
{
    //step 1
    int k, i, j, cost, *d;
    int distance = 0;
    if (one != 0 && two != 0)
    {
        // NOTE(review): the result of this in-kernel malloc is used unchecked.
        d = (int *)malloc((sizeof(int)) * (two + 1) * (one + 1));
        two++;
        one++;
        //step 2
        for (k = 0; k < one; k++) {
            d[k] = k;
        }
        for (k = 0; k < two; k++) {
            d[k * one] = k;
        }
        //step 3 and 4
        for (i = 1; i < one; i++) {
            for (j = 1; j < two; j++)
            {
                //step 5
                printf("%d  %d  %d\n", one, i, j);
                if (s[i - 1] == t[j - 1]) {
                    cost = 0;
                    printf("%d  %d  %d\n", one, i, j);
                }
                else {
                    cost = 1;
                    printf("%d  %d  %d\n", one, i, j);
                }
                printf("%d  %d  %d\n", one, i, j);
                //step 6
                int min = d[(j - 1) * one + i] + 1;
                if (d[j * one + i - 1] + 1 < min)
                    min = d[j * one + i - 1] + 1;
                if (d[(j - 1) * one + i - 1] + cost < min)
                    min = d[(j - 1) * one + i - 1] + cost;
                d[j * one + i] = min;
            }
            // NOTE(review): these three statements sit inside the i loop, so
            // d is freed at the end of the first iteration and then used
            // again by the following ones.
            distance = d[one * two - 1];
            free(d);
            printf("%d\n", distance);
        }
    }
    else
        printf ("-1");
}

// Reads the two FASTA files named on the command line and launches the
// kernel on them with a single thread.
int main(int argc, char *argv[])
{
    char a[max_sequence_length+1];
    char b[max_sequence_length+1];

    if (argc < 3) {
        printf("usage: new_edit_distance <sequence1> <sequence2>\n");
        printf("<sequence1>: file containing first sequence, fasta format\n");
        printf("<sequence2>: file containing second sequence, fasta format\n");
        return EXIT_FAILURE;
    }

    n = importfasta(argv[1], a);
    m = importfasta(argv[2], b);

    // NOTE(review): a and b are host arrays; their addresses are handed to
    // the kernel as-is, with no cudaMalloc/cudaMemcpy staging beforehand.
    levenshtein_distance<<<1, 1>>>(a, b, n, m);
    cudaDeviceSynchronize();
    printf ("%s\n", cudaGetErrorString(cudaGetLastError()));

    return EXIT_SUCCESS;
}

I get it now. You took some straight serial C/C++ code, dropped it into a kernel, intended to run that kernel as a single thread, and then want to proceed from there.

The idea is plausible, but you're missing a key fact about CUDA and GPUs: they can't directly access host memory.

So when you set up a and b like this:

// Excerpt from main(): a and b are plain arrays declared inside main(),
// filled by importfasta().
char a[max_sequence_length+1];
char b[max_sequence_length+1];
....
n = importfasta(argv[1],a);
m = importfasta(argv[2],b);

Those are ordinary variables that live in host memory. GPU (ordinary CUDA) code can't directly access host memory. So when you pass those pointers to a kernel like this:

// Launch as posted: a and b are the arrays declared in main().
levenshtein_distance<<<1, 1>>>(a,b, n, m);

the GPU code will try to dereference those a and b pointers and will fault (unspecified launch failure).

Every CUDA program has the following basic sequence:

  1. Copy data to the GPU
  2. Perform computations on the GPU
  3. Copy results back

You've tried to do step 2 without step 1. It won't work.

Since I'm not able to run your program, because I don't have valid input files, I'll make the following suggestion. I assume you know little or nothing about CUDA. Try adding lines like this:

// Stage both sequences in device memory before the launch, so that the
// kernel receives device pointers (d_a, d_b) instead of the host arrays.
n = importfasta(argv[1],a);              // no change
m = importfasta(argv[2],b);              // no change

char *d_a, *d_b;                          // add this line
cudaMalloc(&d_a, max_sequence_length+1);  // add this line
cudaMalloc(&d_b, max_sequence_length+1);  // add this line

cudaMemcpy(d_a, a, max_sequence_length+1, cudaMemcpyHostToDevice); // add
cudaMemcpy(d_b, b, max_sequence_length+1, cudaMemcpyHostToDevice); // add

levenshtein_distance<<<1, 1>>>(d_a,d_b, n, m); //modify parameters

n and m don't need to be handled any differently, since you are passing those by value.

And add proper CUDA error checking to your code.

Edit: after some further analysis, it's clear that this sequence is not correct:

        // Tail of the kernel's outer (i) loop body as posted: d is read and
        // then freed before the loop's closing brace.
        distance=d[one*two-1];
        free(d);
        printf("%d\n", distance);
    }   // closes the for-i loop
}       // closes the if(one!=0&&two!=0) block

You are freeing d on every iteration of the i loop. That cannot possibly be correct. I suggest you go back to square one and get your serial code working first, in ordinary serial C code, before dropping it into a CUDA kernel this way. If you move the free statement outside of the i loop, then your kernel runs for a very long time. Also be advised that in-kernel printf is limited in the amount of output that can be generated.

I'm not going to debug your code any further for you. Get your serial code working first, then figure out a way to create your kernel without massive quantities of printout.

A final comment: I said above that your approach is "plausible". That means it could be made to work, i.e. produce the same behavior as the same code executing on the host. It does not mean it will run fast. This is not how you get acceleration out of a GPU (running a single block of a single thread). I assume you already know this, based on your comment "how to get this particular issue settled before starting the task of generating parallel code." But I think the disclaimer is appropriate anyway.


Comments

Post a Comment

Popular posts from this blog

c++ - QTextObjectInterface with Qml TextEdit (QQuickTextEdit) -

javascript - angular ng-required radio button not toggling required off in firefox 33, OK in chrome -

xcode - Swift Playground - Files are not readable -