Seqanswers Leaderboard Ad

**gsgs** · 12-20-2012, 12:12 AM

yes, I just compare 2 (nucleotide) files for common substrings.
It doesn't care about alignment and no matter how many
sequences,headers,additional stuff there is in the files.
C-source ( or windows-executable

http://magictour.free.fr/seq/epifox.exe

the smaller file should not be more than ~300M
nucleotides

// improvement ideas: option to display matching epitopes
// sequence-numbers also for file 1
// allow normal fasta format, other formats
// allow/auto-detect 1-line headers or no-headers
// option to count all matching substrings of length 1,2,..,15 simultaneously
// better documentation of output in modes 5,7,
// explain definitions of substring,subsequence (wikipedia)

#include <stdio.h>

int me7,cs2=0,cs3=0,cs4=0,c4,mode,lf,z,s,c3,p1,p2,q,q1,q2,p=0,c2,i1,le,m4,xi,m1=0,min,max,y,i,j,m,n,c,f,a;
long long x,b;
unsigned char bits[333],zw[9],B[999],C[999],T[333];
unsigned char Z[135111111];

FILE *file;

//------------------------------------------------------------
int main(int argc,char*argv[]){

//printf("argv=%i,argv[1][0]=%i\n",argc,argv[1][0]);exit(8);

if(argc==2 && argv[1][0]<46){printf("\nusage:epifox file1 file2 [Lf] [Le] \n\n");
printf("counts the nucleotide-epitopes (=substrings) of length Lf from file 2 \n");
printf(" each of whose substrings of length Le do occur in file1\n\n");
printf(" Lf=substring-length , default Lf=28 \n");
printf(" Le=subsubstring-length , default Le=15 (specify Lf too !) \n\n");
printf(" Lf<15 is interpreted as mode-number \n");
printf(" Lf=3:short mode, Lf=2:very short mode \n");
printf(" Lf=5,7 : individual sequences from file2 (2-line-fasta format) \n");
printf(" \n\n version 0.1, 2012.12.03 \n");
printf(" written by gsgs and put into the public domain \n");
exit(1);}

if(argc<4){printf("\nusage:epifox file1 file2 \n\n");
printf("compares the nucleotide files file1 and file2 \n");
printf(" by counting their common substrings (epitopes) \n");
printf(" \n\n type epifox -h for more details with the advanced menu \n");
exit(1);}

le=15;lf=28;if(argc>3)sscanf(argv[3],"%i",&lf);mode=lf;if(lf<15 || lf>44)lf=28;

//printf("argc=%i lf=%i\n",argc,lf);exit(8);

for(i=0;i<277;i++)T[i]=0;T[67]=1;T[71]=2;T[84]=3;
T[67+32]=1;T[71+32]=2;T[84+32]=3;
b=0;for(i=1;i<le;i++)b=b+b+b+b+3;
zw[0]=1;zw[1]=2;zw[2]=4;zw[3]=8;zw[4]=16;zw[5]=32;zw[6]=64;zw[7]=128;
for(i=0;i<256;i++)for(j=0;j<8;j++)if((1<<j)&i)bits[i]++;

// ---------------- mark the 4^15 - array from entries from file 1

if((file=fopen(argv[1],"rb"))==NULL){printf("\ncan't open file %s\n",argv[1]);exit(1);}

m=0;z=0;
m31:m++;i=0;x=0;
m32:if(feof(file))goto m33;a=fgetc(file);// if(a==10)goto m31;
if(a==62){m36:if(feof(file))goto m33;a=fgetc(file);if(a!=10)goto m36;goto m32;}
if(a<63)goto m32; //## for fasta files
if(a==65 || a==97 || T[a]>0){i++;x=x&b;x=x+x+x+x+T[a];
if(i>=lf){z++;Z[x>>3]|=zw[x&7];}
} goto m32;
m33:fclose(file);

c=0;for(i=0;i<(1<<27);i++)c+=bits[Z[i]];if(mode>13)printf("%i marked m=%i sequences z=%i epitopes\n",c,m,z);

if(mode==5){printf("i1,z,c2,c3,c4,cs2,cs3,cs4\n");}

//-----------------main,file 2------------------------------

if((file=fopen(argv[2],"rb"))==NULL){printf("\ncan't open file %s\n",argv[2]);exit(1);}

m40:c4=0;;c2=0;c3=0;m1=0;
m41:m1++;i1=0;for(i=0;i<111;i++)B[i]=0;i=-1;x=0;c=0;
m42:if(feof(file))goto m43;a=fgetc(file);

if(mode==5 && a==10){cs2+=c2;cs3+=c3;cs4+=c4;printf("%i,%i,%i,%i,%i, %i,%i,%i\n",i1,z,c2,c3,c4,cs2,cs3,cs4);goto m40;}
if(mode==7 && a==10){printf("\n");cs2+=c2;cs3+=c3;cs4+=c4;goto m40;}

if(a==65 || a==67 || a==71 || a==84)c4++;
if(a==62){m46:if(feof(file))goto m43;a=fgetc(file);if(a!=10)goto m46;goto m42;}
if(a<63)goto m42; //## for fasta files
if(a!=65 && a!=65+32 && T[a]==0)goto m42;

i1++;i++;if(i==lf)i=0;
B[i]=a;B[i+lf]=a;B[i+lf+lf]=a;x=x&b;x=x+x+x+x+T[a];
C[i]=0;C[i+lf]=0;C[i+lf+lf]=0;

if((Z[x>>3]&zw[x&7])==0){if(mode==7)if(me7>0)printf("%5i,",i1);me7=0;}

if(Z[x>>3]&zw[x&7] && i1>=lf)
{c=1;C[i]=1;C[i+lf]=1;C[i+lf+lf]=1;c2++;me7=1;

s=0;for(j=i+le;j<=i+lf;j++)s+=C[j];
if(s==lf-le+1){
// for(p1=1;p1<=lf;p1++)printf("%c",B[p1+i]);printf(",");

c3++;}

}
goto m42;

m43:fclose(file);

m83:;

if(mode>13)printf("%i records(m1) %i total 15-(sub)matches(c2) %i likely 28er(all 15-substrings) %i total nucleotides\n",m1,c2,c3,c4);

if(mode==2)printf("m1=%i,c2=%i,c3=%i,c=%i,m=%i,z=%i c4=%i\n",m1,c2,c3,c,m,z,c4);
b=z;x=c2;x=x*1000;x=x/b;c2=x;b=z;x=c3;x=x*1000;x=x/b;c3=x;
if(mode==3)printf("%3i,%3i,%s,%s\n",c2,c3,argv[1],argv[2]);

}

**RickBioinf** · 12-20-2012, 12:19 AM

Hi OllyBolly,

I am doing this right now and am using multiple different databases (RDP, Silva and Greengenes).
If you look at: MEGAN-download you will see that you can download the "GI accession number to NCBI taxon ids" (binary) files, these can do what the title says when you give them to MEGAN.
You can also see the "Synonyms file for processing BLAST against Silva database" (tab delimeted file), I noticed that when I use the 3 databases that they all have these numbers in them though not always return this synonym text. So if you want to use RDP or Greengenes look at your returned data and determine if you need to make a little program to get the right text.

Rick

Topics	Statistics	Last Post
Expanding the Horizons of Cellular Research with the Single Cell Atlas by seqadmin Started by seqadmin, Yesterday, 11:49 AM	0 responses 15 views 0 likes	Last Post by seqadmin Yesterday, 11:49 AM
Genetic Variants and Diabetes Risk in Childhood Cancer Survivors by seqadmin Started by seqadmin, 04-24-2024, 08:47 AM	0 responses 16 views 0 likes	Last Post by seqadmin 04-24-2024, 08:47 AM
Cancer Metastasis: A Deep Dive into Cellular Plasticity by seqadmin Started by seqadmin, 04-11-2024, 12:08 PM	0 responses 62 views 0 likes	Last Post by seqadmin 04-11-2024, 12:08 PM
Proteogenomic Profiles Offer New Clues in Prostate Cancer by seqadmin Started by seqadmin, 04-10-2024, 10:19 PM	0 responses 60 views 0 likes	Last Post by seqadmin 04-10-2024, 10:19 PM

Seqanswers Leaderboard Ad

Announcement

Custom Blast database for MEGAN metagenomics

Comment

Comment

Latest Articles

ad_right_rmr

News