This directory contains bowtie2-format targets
consisting of the entries from repeatMasker.

# ############
# And these from genbank etc.

cre.fa
lacz.fa
egfp.fa

phiX.fa

# ############
# And these from Illumina iGenomes web site

polyA.fa
polyC.fa
illumina_adapter1.fa


set faList = ( \
 cre.fa            \
 lacz.fa           \
 egfp.fa           \
 phiX.fa           \
 polyA.fa          \
 polyC.fa          \
 illumina_adapter1.fa          \
)

set faCommaList = 
set faSpaceList = 
set init = 0
foreach fa ($faList)
    if ($init == 0) then
        set faCommaList = "${fa}"
        set faSpaceList = "${fa}"
        set init = 1
        continue
    endif
    set faCommaList = "${faCommaList},${fa}"
    set faSpaceList = "${faSpaceList} ${fa}"
end


#set colorSpaceOption = " -C "
set colorSpaceOption =


set species = caenorhabditis_elegans

set genome = ce10
set rptmaskSfx  = cre_lacz_egfp_phiX
set rptmaskName = ${genome}_repeatmask_${rptmaskSfx}
set rptmaskDir = /hive/groups/wet/illumina/repeatMasker/${genome}/bowtie2_repeatMasker_${genome}_${rptmaskSfx}


1) make links to the libraries of repeatMasker elements.

   NOTE: these are found on /scratch/data on hgwdev, which is a link to /hive/data/staging/data:
   If the subsequent jobs are run on hgwdev, can use links in this directory to /scratch

   (sol@hgwdev)/scratch/data$ ls -l /scratch/data
   lrwxrwxrwx 1 hiram genecats 23 Aug  6  2013 /scratch/data -> /hive/data/staging/data

   (sol@hgwdev)/scratch/data$ ls -l /hive/data/staging/data | grep Repeat
   drwxrwxr-x  8 rhubley  genecats      8192 Dec 29  2015 RMH-RepeatMasker
   lrwxrwxrwx  1 hiram    genecats        18 May 20  2014 RepeatMasker -> RepeatMasker140131
   drwxrwxr-x  6 angie    genecats      8192 Sep 14  2011 RepeatMasker100630
   drwxrwxr-x  6 angie    genecats      8192 Jan 22  2013 RepeatMasker110426
   drwxrwxr-x  6 hiram    genecats      8192 Feb 15  2013 RepeatMasker130110
   drwxr-xr-x  6 hiram    genecats      8192 Mar  5  2013 RepeatMasker130220
   drwxr-xr-x  6 hiram    genecats      8192 Jun 20  2013 RepeatMasker130429
   drwxr-xr-x  6 hiram    genecats      8192 May 14  2014 RepeatMasker130620
   drwxr-xr-x  6 hiram    genecats      8192 Jan 15  2017 RepeatMasker140131
   drwxrwxr-x  6 hiram    genecats      8192 Jan 15  2017 RepeatMasker151029

   cd $rptmaskDir

   set libList = ( \
                 specieslib         \
                 )

   foreach lib ($libList)
       ln -s -f /scratch/data/RepeatMasker140131/Libraries/20140131/${species}/${lib}      .
   end

1b) Add a couple of sequences supplied by Al Zahler for rRNA, possibly redundant
    with the set in specieslib

   wget "http://elegans.ucsc.edu/ForSol/CElegansrRNAtRNAfilter.txt"

   manually edit to retain only 2 sequences in zahlerlib

   >az_rRNA_5S
   >az_rRNA_18S_5.8S_26S

   set libList = ( \
                 specieslib         \
                 zahlerlib          \
                 )

2) Find the non-redundant set of elements in those libraries

   foreach lib ($libList)
       grep "^>" ${lib} | sed -e "s@>@@" | gawk '{print $1}' > names${lib}
   end

   echo -n "" > temp.extracted.names
   rm -f nonRedundantLib
   foreach lib ($libList)
       cat temp.extracted.names names${lib} | sort | uniq -c | gawk '($1 > 1){print $2}' > temp.skip.names
       faSomeRecords -exclude ${lib} temp.skip.names temp.one.nr.fa
       cat temp.one.nr.fa >> nonRedundantLib
       grep "^>" nonRedundantLib | sed -e "s@>@@" | gawk '{print $1}' > temp.extracted.names
       set numNRnames = `cat temp.extracted.names | wc -l`
       echo "$numNRnames non-redundant names after processing ${lib}"
   end

   361 non-redundant names after processing specieslib
   363 non-redundant names after processing zahlerlib

   cat names*lib | sort | uniq | wc -l
   363


3) run bowtie2-build to make a merged index of the set of non-redundant repeatmask libraries
   for use with colorspace reads (-C option) from the fasta files (-f option).

   cd $rptmaskDir

   $BOWTIE2ROOT/bowtie2-build ${colorSpaceOption} --seed 1234567 -f ${faCommaList},nonRedundantLib ${rptmaskName} >& bowtie2-build.log
   NOTE: this build is very fast!


3b) sanity check the bowtie2 index

   $BOWTIE2ROOT/bowtie2-inspect ${rptmaskName} > temp.inspect.fa
   faCount -summary  temp.inspect.fa 
        
   #seq    len     A       C       G       T       N       cpg
   total   428975  130368  94527   89943   113380  757     19755
   prcnt   1.0     0.3039  0.2204  0.2097  0.2643  0.0018  0.0461


   faSize            temp.inspect.fa

   428975 bases (757 N's 428218 real 428218 upper 0 lower) in 370 sequences in 1 files
   Total size: mean 1159.4 sd 2213.1 min 96 (CeRep58#Satellite) max 17003 (CER8-I_CE#LTR/Pao) median 193
   N count: mean 2.0 sd 20.5
   U count: mean 1157.3 sd 2213.8
   L count: mean 0.0 sd 0.0
   %0.00 masked total, %0.00 masked real

   faSize  -detailed temp.inspect.fa > temp.inspect.sizes


4) Get the list of element names and lengths of the sequences:
   First convert the concatenated fasta into one line per record, then get the length of the records in Kbp

   cd $rptmaskDir

   cat    ${faSpaceList} nonRedundantLib > temp.fullNRlib.fa
   $SEQUENCES_HOME/scripts/fastaOneLine   < temp.fullNRlib.fa > temp.fullNRlib.oneLine.fa

   echo "#repeatName lenKb" | gawk 'BEGIN{OFS="\t"}{$1=$1;print}' > ${rptmaskName}.namesAndLen.txt

   cat temp.fullNRlib.oneLine.fa | \
   sed -e "s@>@@"        | \
   gawk '(FNR % 2 == 1){printf("%s\t",$1)} \
         (FNR % 2 == 0){printf("%5.2f\n",0.001*length($1))}' |\
   sort -k 1,1 \
                                                                 >>  ${rptmaskName}.namesAndLen.txt

   rm -f temp*