HWI-ST560:74:D14ELACXX:4:1201:8827:168386 pr1 chr1 10024104 50 50M = 10020756 -3398 TGGTTCTTGAAACTGCTGGTTCAGCATCTGTGTACTAACATCAATCCCGG IJJJJJJJIIIGJJJJJJJJJJJJJJJJJJJJIJJJJHHHHHFFFFFCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:50 YT:Z:UU NH:i:1 XS:A:+
HWI-ST560:74:D14ELACXX:4:1201:8827:168386 pR2 chr1 10020756 50 36M1628N14M = 10024104 3398 GATGATTTGAAATATGAGACTTCTAAGGCATAATATTGTTTGCAGTGCAC CCCFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJIIIIJJJJIJIJHJGEC AS:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:50 NM:i:0 XS:A:- NH:i:1
This is a dUTP protocol where "R2/r1" FLAG for the read pair indicate the reads are a transcript on the + strand, which means both reads should be assigned as XS:A:+. However /2 is assigned to XS:A:-.
I've written an awk script to solve the problem:
#!/bin/awk -f
BEGIN{
if(save_discrepancy_to_file!="") system("[ -e " save_discrepancy_to_file " ] && rm " save_discrepancy_to_file);
}
{
if($1 ~ /^@/) print;
else
{
for(i=1;i<=NF;i++) if($i!~/^XS/) printf("%s\t",$i); else XS0=$i;
XS1=XS0;
if($2~/^0x/ || $2~/^[0-9]+$/){ # FLAG in HEX or Decimal format
if(libtype=="fr-firststrand") XS1=((and($2, 0x10) && and($2, 0x40)) || (and($2,0x80) && !and($2,0x10)))?"XS:A:+":"XS:A:-";
if(libtype=="fr-secondstrand") XS1=((and($2, 0x10) && and($2, 0x80)) || (and($2,0x40) && !and($2,0x10)))?"XS:A:+":"XS:A:-";
}
else if($2~/^[:alpha:]/){ # FLAG in string
if(libtype=="fr-firststrand") XS1=($2~/r.*1/ || ($2~/2/ && $2!~/r/))?"XS:A:+":"XS:A:-";
if(libtype=="fr-secondstrand") XS1=($2~/r.*2/|| ($2~/1/ && $2!~/r/))?"XS:A:+":"XS:A:-";
}
print XS1;
if(save_discrepancy_to_file!="" && XS1!=XS0) print >> save_discrepancy_to_file;
}
}
No comments:
Post a Comment