This directory contains bowtie2-format targets consisting of the entries from repeatMasker. # ############ # And these from genbank etc. cre.fa lacz.fa egfp.fa phiX.fa # ############ # And these from Illumina iGenomes web site polyA.fa polyC.fa illumina_adapter1.fa set faList = ( \ cre.fa \ lacz.fa \ egfp.fa \ phiX.fa \ polyA.fa \ polyC.fa \ illumina_adapter1.fa \ ) set faCommaList = set faSpaceList = set init = 0 foreach fa ($faList) if ($init == 0) then set faCommaList = "${fa}" set faSpaceList = "${fa}" set init = 1 continue endif set faCommaList = "${faCommaList},${fa}" set faSpaceList = "${faSpaceList} ${fa}" end #set colorSpaceOption = " -C " set colorSpaceOption = set species = caenorhabditis_elegans set genome = ce10 set rptmaskSfx = cre_lacz_egfp_phiX set rptmaskName = ${genome}_repeatmask_${rptmaskSfx} set rptmaskDir = /hive/groups/wet/illumina/repeatMasker/${genome}/bowtie2_repeatMasker_${genome}_${rptmaskSfx} 1) make links to the libraries of repeatMasker elements. NOTE: these are found on /scratch/data on hgwdev, which is a link to /hive/data/staging/data: If the subsequent jobs are run on hgwdev, can use links in this directory to /scratch (sol@hgwdev)/scratch/data$ ls -l /scratch/data lrwxrwxrwx 1 hiram genecats 23 Aug 6 2013 /scratch/data -> /hive/data/staging/data (sol@hgwdev)/scratch/data$ ls -l /hive/data/staging/data | grep Repeat drwxrwxr-x 8 rhubley genecats 8192 Dec 29 2015 RMH-RepeatMasker lrwxrwxrwx 1 hiram genecats 18 May 20 2014 RepeatMasker -> RepeatMasker140131 drwxrwxr-x 6 angie genecats 8192 Sep 14 2011 RepeatMasker100630 drwxrwxr-x 6 angie genecats 8192 Jan 22 2013 RepeatMasker110426 drwxrwxr-x 6 hiram genecats 8192 Feb 15 2013 RepeatMasker130110 drwxr-xr-x 6 hiram genecats 8192 Mar 5 2013 RepeatMasker130220 drwxr-xr-x 6 hiram genecats 8192 Jun 20 2013 RepeatMasker130429 drwxr-xr-x 6 hiram genecats 8192 May 14 2014 RepeatMasker130620 drwxr-xr-x 6 hiram genecats 8192 Jan 15 2017 RepeatMasker140131 drwxrwxr-x 6 hiram genecats 8192 Jan 15 2017 RepeatMasker151029 cd $rptmaskDir set libList = ( \ specieslib \ ) foreach lib ($libList) ln -s -f /scratch/data/RepeatMasker140131/Libraries/20140131/${species}/${lib} . end 1b) Add a couple of sequences supplied by Al Zahler for rRNA, possibly redundant with the set in specieslib wget "http://elegans.ucsc.edu/ForSol/CElegansrRNAtRNAfilter.txt" manually edit to retain only 2 sequences in zahlerlib >az_rRNA_5S >az_rRNA_18S_5.8S_26S set libList = ( \ specieslib \ zahlerlib \ ) 2) Find the non-redundant set of elements in those libraries foreach lib ($libList) grep "^>" ${lib} | sed -e "s@>@@" | gawk '{print $1}' > names${lib} end echo -n "" > temp.extracted.names rm -f nonRedundantLib foreach lib ($libList) cat temp.extracted.names names${lib} | sort | uniq -c | gawk '($1 > 1){print $2}' > temp.skip.names faSomeRecords -exclude ${lib} temp.skip.names temp.one.nr.fa cat temp.one.nr.fa >> nonRedundantLib grep "^>" nonRedundantLib | sed -e "s@>@@" | gawk '{print $1}' > temp.extracted.names set numNRnames = `cat temp.extracted.names | wc -l` echo "$numNRnames non-redundant names after processing ${lib}" end 361 non-redundant names after processing specieslib 363 non-redundant names after processing zahlerlib cat names*lib | sort | uniq | wc -l 363 3) run bowtie2-build to make a merged index of the set of non-redundant repeatmask libraries for use with colorspace reads (-C option) from the fasta files (-f option). cd $rptmaskDir $BOWTIE2ROOT/bowtie2-build ${colorSpaceOption} --seed 1234567 -f ${faCommaList},nonRedundantLib ${rptmaskName} >& bowtie2-build.log NOTE: this build is very fast! 3b) sanity check the bowtie2 index $BOWTIE2ROOT/bowtie2-inspect ${rptmaskName} > temp.inspect.fa faCount -summary temp.inspect.fa #seq len A C G T N cpg total 428975 130368 94527 89943 113380 757 19755 prcnt 1.0 0.3039 0.2204 0.2097 0.2643 0.0018 0.0461 faSize temp.inspect.fa 428975 bases (757 N's 428218 real 428218 upper 0 lower) in 370 sequences in 1 files Total size: mean 1159.4 sd 2213.1 min 96 (CeRep58#Satellite) max 17003 (CER8-I_CE#LTR/Pao) median 193 N count: mean 2.0 sd 20.5 U count: mean 1157.3 sd 2213.8 L count: mean 0.0 sd 0.0 %0.00 masked total, %0.00 masked real faSize -detailed temp.inspect.fa > temp.inspect.sizes 4) Get the list of element names and lengths of the sequences: First convert the concatenated fasta into one line per record, then get the length of the records in Kbp cd $rptmaskDir cat ${faSpaceList} nonRedundantLib > temp.fullNRlib.fa $SEQUENCES_HOME/scripts/fastaOneLine < temp.fullNRlib.fa > temp.fullNRlib.oneLine.fa echo "#repeatName lenKb" | gawk 'BEGIN{OFS="\t"}{$1=$1;print}' > ${rptmaskName}.namesAndLen.txt cat temp.fullNRlib.oneLine.fa | \ sed -e "s@>@@" | \ gawk '(FNR % 2 == 1){printf("%s\t",$1)} \ (FNR % 2 == 0){printf("%5.2f\n",0.001*length($1))}' |\ sort -k 1,1 \ >> ${rptmaskName}.namesAndLen.txt rm -f temp*