#This file documents the commands used to generate all RepeatBrowser tracks # # # # # #########Consensus Alignment track # Just a simple bar spanning the length of each consensus but with info on Dfam mappings etc # # Repeat Browser # DFAM Record # DFAM Type # DFAM Class # DFAM Organismn # DFAM Comment # RepeatMasker Names # Number of copies in genome, total # Copies in genome, per RepeatMasker Name # Liftable copies # Sequence Source(s) # Length of 99% percentile in genome # Maximum length in genome # Length of top50 alignment # Number of Ns in top50 consensus # Ratio of alignment length to top50 length # # Output: RepeatConsensusAlignment.bb ### wget https://hgwdev.gi.ucsc.edu/~max/kznf/hg38reps/tracks/build.bb mv build.bb ../hg38reps/RepeatConsensusAlignment.bb #########ORFs track # ORFs: Using Max's summary.tsv # For exact matches, search Dfam.embl for the name using the NM record in Dfam embl and grab the whole record (ending in \\). Then grep CDS features and print tab separated field with # For int-strip, do the same thing. # For everything else (that isn't a manual consensus) do the same thing. # So in the end I could have just used the same command for all three cases (awk '($6 != cons), but I had to think it through and assumed cases would be different going in. This is exactly what was run. # # For the built consensuses I first make a fasta file of built consensuses, and make a blast database/ # Then I subset the RepeatMasker peptide library to those species/phylogenetic groupings for TEs found in hg38 # Then I use tblastn to find the highest scoring gag and pol for each built consensus (since built consensuses are all LINEs) # The tblastn bitscore must also be above 50. Because of bed numbering vs tblastn I shifted start down one (since I noticed ATG becoming TG) # # Output: orfs.bb ### #for exact matches awk '($6=="exact")' summary.tsv | cut -f 1 | while read i; do sed -n -e '/^NM\s*'${i}'$/,/\/\// p' Dfam.embl | grep -A1 "FT.*CDS "| tr -s "\n" "\t" | sed "s/--/\n/g" | tr -s "." " \t" | tr -s " " "\t" | tr -s "\"" "\t" | awk -v s=$i '{print s"\t"$3"\t"$4"\t"$7}'; done > exact_annotations.tab #for renamed matches awk '($6=="int-strip")' summary.tsv | cut -f 1 | while read i; do sed -n -e '/^NM\s*'${i}'$/,/\/\// p' Dfam.embl | grep -A1 "FT.*CDS "| tr -s "\n" "\t" | sed "s/--/\n/g" | tr -s "." " \t" | tr -s " " "\t" | tr -s "\"" "\t" | awk -v s=$i '{print s"\t"$3"\t"$4"\t"$7}'; done > intstrip_annotations.tab #for manual matches awk '($6!="int-strip" && $6!="exact" && $6!="cons")' summary.tsv | cut -f 1 | while read i; do sed -n -e '/^NM\s*'${i}'$/,/\/\// p' Dfam.embl | grep -A1 "FT.*CDS "| tr -s "\n" "\t" | sed "s/--/\n/g" | tr -s "." " \t" | tr -s " " "\t" | tr -s "\"" "\t" | awk -v s=$i '{print s"\t"$3"\t"$4"\t"$7}'; done > manual_annotations.tab cat intstrip_annotations.bed manual_annotations.bed exact_annotations.bed | awk '{print $0"\t0\t+"}'| sort | uniq > Dfam_annotations.bed #built cons awk '($6=="cons")' ../summary.tsv | cut -f 1 | while read i; do sed -n -e '/^>'${i}'$/,/^>/ p' ~/hive/jferna10/RepeatBrowserHub/hg38reps/hg38reps.fa | head -n-1 >> built_cons.fa; done cd peps/ #RepeatMasker pep library from Hubley wget https://github.com/rmhubley/RepeatMasker/raw/master/Libraries/RepeatPeps.lib # makeblastdb -in built_cons.fa -dbtype nucl -parse_seqids #phylogenetic groupings for TEs found in hg38 (this list was arrived at by taking the Dfam name, reduce repeat peptide libs to peptides from those groups cut -f 1 summary.tsv | while read i ; do sed -n -e '/^NM\s*'${i}'$/,/\/\// p' Dfam.embl | grep "^OS" | cut -f2; done > hg38_TE_phlyo.txt cat hg38_TE_phlyo.txt | tr -s " " "\t" | cut -f2 | sort | uniq | tr "\n" "|" #copy and paste output to grep,reduces peptide library significantly, prevents aligning proteins from random creatures to human grep -E 'Afrotheria|Amniota|Boreoeutheria|Carnivora|Catarrhini|Cetartiodactyla|Euarchontoglires|Euteleostomi|Eutheria|Glires|Haplorrhini|Hominidae|Homininae|Hominoidea|Homo|Mammalia|Metatheria|Metazoa|Monotremata|Primates|Rodentia|Sauropsida|Scandentia|Simiiformes|Tetrapoda|Theria|Vertebrata|Xenarthra' RepeatPeps.lib -A1 > HumanRepeatPeps.lib grep ">" built_cons.fa | cut -f 2 -d">" | while read i; do grep -A1 "$i" HumanRepeatPeps.lib;done > Human_built_cons_peps.lib sed -zi "s/--\n//g" Human_built_cons_peps.lib #find peptides in consensuses tblastn -db built_cons.fa -query Human_built_cons_peps.lib -outfmt 6 > built_consensus_annotations.tab cat built_consensus_annotations.tab | sort | uniq | sort -k 2,12 -n -r > temp.tab #take the highest scoring gag and pol, (since all built consesnsus are L1s); it must also be above 50 grep ">" built_cons.fa | cut -f 2 -d">" | while read i; do grep -P "\t$i\t" temp.tab | grep "gag" | sort -k 12 -n -r | head -n+1 | awk '($12> 50) {print $2"\t"$9-1"\t"$10"\t"$1"\t"$12"\t+"}';done | sort | uniq > cons_gag.bed grep ">" built_cons.fa | cut -f 2 -d">" | while read i; do grep -P "\t$i\t" temp.tab | grep "pol" | sort -k 12 -n -r | head -n+1 | awk '($12> 50) {print $2"\t"$9-1"\t"$10"\t"$1"\t"$12"\t+"}';done | sort | uniq > cons_pol.bed mv *.bed ../ cd .. #combine and change score to 0 (avoid issues) cat Dfam_annotations.bed cons_gag.bed cons_pol.bed HERV_full.bed| awk '{print $1"\t"$2"\t"$3"\t"$4"\t"0"\t"$6}' | sed "s/L1PBA1/L1PBa1/g" | bedtools sort > ORFs.bed bedToBigBed ORFs.bed ../hg38reps/hg38reps.sizes ORFs.bb mv ORFs.bb ../hg38reps/orfs.bb #########knownGene32 # liftOver of the genes to hg38reps # # Get knownGene table from UCSC # swap out ENST with chr:start-stop and make bed12 # Also make a keyfile with ENST, name and bed coords in fields, also saved thickStart,thickStop and strand # # # Something strange is going on...if I lift just exons, LOTS of liftover. Lose coding/non-coding annotations though :( # Lifting bed12 requires messing with liftOver setting, preserved most coding/non-coding annotations, althouhg occasionally messed up # Also missing some hits that repeats2 picked up and should be picked up by the chain (e.g. liftOver successful on just exons) # More work to be done here, check with Max. This is some issue with liftOver and bed12 (it doesn't like the gene being partially deleted). # Could use exon liftover, LOTS of hits that way, how do I preserved UTR/CDS info if just lifting a bed 6? # # Output: RepBro_knownGene.bb ### wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/knownGene32.bb bigBedToBed knownGene32.bb knownGene32.bed cut -f 1-12 knownGene32.bed > knownGene32_12.bed #sub out ENST with chr coords, make bed12 awk '$4=$1":"$2"-"$3' knownGene32.bed | tr -s " " "\t" | cut -f 1-12 > knownGene32_12.bed #sub out score with chr coords, build key file awk '$5=$1":"$2"-"$3' knownGene32.bed | tr -s " " "\t" | cut -f 4-8,10,17 > knownGene32_keyfile.txt liftOver knownGene32_12.bed ../lift/hg38_to_hg38reps.over.chain RepBro_temp.bed RepBro_temp.unmapped -multiple -minBlocks=0.001 -fudgeThick -minMatch=0.0005 -minSizeQ=0 #bedSort RepBro_temp.bed RepBro_temp.bed join -1 4 -2 2 <(sort -k4 RepBro_temp.bed) <(sort -k2 knownGene32_keyfile.txt) | awk '{print $2"\t"$3"\t"$4"\t"$18"\t0\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$1"\t"$13}' | sort | uniq | bedtools sort > temp.bed if ($7<$3){$7=$3} else if ($7>$4) {$7=0;$8=0} else if ($8>$3){$8=$3} else if ($8<$3) {$7=0;$8=0}; {if ($7>=$8){$7=0;$8=0}}; bedToBigBed temp.bed ../hg38reps/hg38reps.sizes RepBro_knownGene.bb -type=bed12+2 #thickstarts occasionally miscalcualted #correct thickstart > thickstop awk -v OFS="\t" '{if($7>$8) {print $1,$2,$3,$4,$5,$6,$8,$7,$9,$10,$11,$12,$13,$14}else{print$0}}' temp.bed > temp2.bed #correct illegal coordinates awk -v OFS="\t" '{if($7<$2) {print $1,$2,$3,$4,$5,$6,$2,$8,$9,$10,$11,$12,$13,$14}else{print$0}}' temp2.bed > temp.bed awk -v OFS="\t" '{if($7>$3) {print $1,$2,$3,$4,$5,$6,"0","0",$9,$10,$11,$12,$13,$14}else{print$0}}' temp.bed > temp2.bed awk -v OFS="\t" '{if($8>$3) {print $1,$2,$3,$4,$5,$6,$7,$3,$9,$10,$11,$12,$13,$14}else{print$0}}' temp2.bed > temp.bed awk -v OFS="\t" '{if($8<$2) {print $1,$2,$3,$4,$5,$6,"0","0",$9,$10,$11,$12,$13,$14}else{print$0}}' temp.bed > temp2.bed bedToBigBed temp2.bed ../hg38reps/hg38reps.sizes RepBro_knownGene.bb -type=bed12+2 ##########trying to figure out what is wrong with thickstarts and liftOvers bedToExons knownGene32_12.bed knownGene32_exons.bed awk '$7 == $8' knownGene32_12.bed > ncRNA_knownGene32_temp.bed uniqueToFileOne.sh knownGene32_12.bed ncRNA_knownGene32_temp.bed | tail -n+2 > cds_knownGene32_12.bed bedToExons ncRNA_knownGene32_temp.bed ncRNA_knownGene32_exons.bed bedToExons -cdsOnly cds_knownGene32_12.bed temp.bed bedToExons cds_knownGene32_12.bed knownGene32_exons.bed awk '$2 != $3' temp.bed > cds_knownGene32_exons.bed bedtools subtract -a knownGene32_exons.bed -b cds_knownGene32_exons.bed > utr_knownGene32_exons.bed liftOver ncRNA_knownGene32_exons.bed ../lift/hg38_to_hg38reps.over.chain RepBro_ncRNA_knownGene32_12.bed RepBro_ncRNA_knownGene32_12.unmapped -multiple liftOver cds_knownGene32_exons.bed ../lift/hg38_to_hg38reps.over.chain RepBro_cds_knownGene32_exons.bed RepBro_cds_knownGene32_exons.unmapped -multiple liftOver utr_knownGene32_exons.bed ../lift/hg38_to_hg38reps.over.chain RepBro_utr_knownGene32_exons.bed RepBro_utr_knownGene32_exons.unmapped -multiple awk -v OFS="\t" '{$7=$2;$8=$2;$9=0; print $0}' RepBro_ncRNA_knownGene32_12.bed > t1.bed awk -v OFS="\t" '{$7=$2;$8=$3;$9=0; print $0}' RepBro_cds_knownGene32_exons.bed > t2.bed awk -v OFS="\t" '{$7=$2;$8=$2;$9=0; print $0}' RepBro_utr_knownGene32_exons.bed > t3.bed join -1 4 -2 1 <(sort -k4 RepBro_ncRNA_knownGene32_12.bed) <(sort -k1 knownGene32_keyfile.txt | cut -f 1,2,7) | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$3,$1,$7}' | sort | uniq | bedtools sort > RepBro_ncRNA_knownGene32_14.bed join -1 4 -2 1 <(sort -k4 RepBro_cds_knownGene32_exons.bed) <(sort -k1 knownGene32_keyfile.txt | cut -f 1,2,7) | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$4,$1,$7}'| sort | uniq | bedtools sort > RepBro_cds_knownGene32_exons_14.bed join -1 4 -2 1 <(sort -k4 RepBro_utr_knownGene32_exons.bed) <(sort -k1 knownGene32_keyfile.txt | cut -f 1,2,7) | awk -v OFS="\t" '{print $2,$3,$4,$8,"0", $6,$3,$3,$1,$7}' | sort | uniq | bedtools sort > RepBro_utr_knownGene32_exons_14.bed bedtools groupby -g 1,2,3,4,5,6,7,8 -o collapse -c 9,10 -i RepBro_cds_knownGene32_exons_14.bed > collapse_RepBro_cds_knownGene32_exons_14.bed bedtools groupby -g 1,2,3,4,5,6,7,8 -o collapse -c 9,10 -i RepBro_ncRNA_knownGene32_14.bed > collapse_RepBro_ncRNA_knownGene32_14.bed bedtools groupby -g 1,2,3,4,5,6,7,8 -o collapse -c 9,10 -i RepBro_utr_knownGene32_exons_14.bed > collapse_RepBro_utr_knownGene32_exons_14.bed bedToBigBed collapse_RepBro_cds_knownGene32_exons_14.bed ../hg38reps/hg38reps.sizes gencode_cds.bb -type=bed8+2 bedToBigBed collapse_RepBro_ncRNA_knownGene32_14.bed ../hg38reps/hg38reps.sizes gencode_ncRNA.bb -type=bed8+2 bedToBigBed collapse_RepBro_utr_knownGene32_exons_14.bed ../hg38reps/hg38reps.sizes gencode_utr.bb -type=bed8+2 ########### join -1 9 -2 1 <(sort -k9 collapse_RepBro_cds_knownGene32_exons_14.bed) <(sort -k1 GENCODE_v32_knownAttrs.tab) | cut -f 12 -d" " join -1 9 -2 1 <(sort -k9 collapse_RepBro_ncRNA_knownGene32_14.bed) <(sort -k1 GENCODE_v32_knownAttrs.tab) | cut -f 12 -d" " join -1 9 -2 1 <(sort -k9 collapse_RepBro_utr_knownGene32_exons_14.bed) <(sort -k1 GENCODE_v32_knownAttrs.tab)| cut -f 12 -d" " #cat t1.bed t2.bed t3.bed | awk -v OFS="\t" '$5="0"' | sort | uniq > t.bed #bedSort t.bed t.bed #bedtools groupby -g 1,4,6 -c 2,3,7,8 -o collapse -i t.bed > s.bed #grep "," -v s.bed | awk -v OFS="\t" '{print $1, $4, $5,$2,"0",$3,"0","0","0","1",$5-$4",", "0,"}'> single_exon_lift.bed #grep "," s.bed | awk -v OFS="\t" '{ {split($4, a, ",")} {split($5, b, ",")} {print $1, a[1],b[length(b)], $2, "0", $3, $6, $7,"0", length(b), b[length(b)] - a[1]",", $4, $5 }}'> multi_exon_lift.bed #cat multi_exon_lift.bed | awk -v OFS="\t" '{{split($7, a, ",")} {split($8, b, ",")} {thick_s=0} {for (i in a){if(thick_s == 0 || (a[i] < thick_s && a[i] !=0)) {thick_s=a[i]} } } {thick_e=0} {for (i in b){if(thick_e < b[i]) {thick_e=b[i]} } } {print $1,$2,$3,$4,$5,$6,thick_s,thick_e, $9,$10,$11,$12, $13} }' >temp.bed #cat temp.bed | awk -v OFS="\t" '{{split($12, a, ",")} {split($13, b, ",")} {i_s=""} {for (i in a){ i_s = i_s""(b[i]-a[i])"," } } {i_b=""} {for (i in a){t = (a[i]-$2); i_b=i_b""t","} } {print $1,$2,$3,$4,$5,$6,$7,$8, $9,$10, i_s, i_b } }' > temp2.bed #cat temp2.bed single_exon_lift.bed | bedtools sort > known_gene_with_thicks.bed join -1 4 -2 1 <(sort -k4 known_gene_with_thicks.bed) <(sort -k1 knownGene32_keyfile.txt | cut -f 1,2,7) | awk -v OFS="\t" '{print $2,$3,$4,$14,"0", $6,$7,$8,$9,$10,$11,$12,$13,$1}' > knownGene32_exons_14.bed bedSort knownGene32_exons_14.bed knownGene32_exons_14.bed cut -f 1-12 knownGene32_exons_14.bed > test.bed sed -n '47655p' test.bed join -1 4 -2 1 <(sort -k4 knownGene32_exons.bed) <(sort -k1 knownGene32_keyfile.txt) | awk -v OFS="\t" '{print $2,$3,$4,$7,$11,$6,$9,$10,"0","1",$4-$3",","0,",$12,$1}' > knownGene32_14.bed liftOver knownGene32_exons.bed ../lift/hg38_to_hg38reps.over.chain RepBro_knownGene32_exons.bed RepBro_knownGene32_exons.unmapped -multiple join -1 4 -2 1 <(sort -k4 RepBro_knownGene32_exons.bed) <(sort -k1 GENCODE_v32_knownAttrs.tab | cut -f1,5) | awk '{print $2"\t"$3"\t"$4"\t"$7"\t0\t"$6}' | bedtools sort > RepBro_knownGene_32_rename.bed awk '{split ($4,a,"-"); {print $1"\t"$2"\t"$3"\t"a[1]"\t0\t"$6}}' temp.bed | sort | uniq | bedtools sort > temp2.bed bedtools sort -i temp2.bed | bedtools groupby -g 1,4,5,6 -c 2,3 -o min,max | awk -v OFS="\t" '{print $1,$5,$6,$2,$3,$4}' > temp.bed #bedtools groupby -g 1,4,5,6 -c 2,3 -o min,max cut -f 1-12 knownGene32_14.bed > knownGene32_12.bed awk -v OFS="\t" '{else {print $0}}' temp2.bed > temp3.bed cut -f 1-12 temp.bed > temp1.bed liftOver knownGene32_12.bed ../lift/hg38_to_hg38reps.over.chain RepBro_temp.bed RepBro_temp.unmapped -multiple -minBlocks=0.001 -fudgeThick -minMatch=0.0005 -minSizeQ=0 join -1 4 -2 1 <(sort -k4 RepBro_temp.bed) <(sort -k1 knownGene32_keyfile.txt) | awk '{print $2"\t"$3"\t"$4"\t"$13"\t0\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$1}' | bedtools sort > temp.bed ########### wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz #########schmittges2016 # liftOver of the genes to hg38reps # # Copied bb from hg19, swap chromosome coordinates to name # Output: 160 *_hg38reps.bb files in ../hg38reps/schmittges2016 # Output: 160 *_hg38reps.bw files in ../hg38reps/schmittges2016 ### #make bb summits mkdir ../hg38reps/schmittges2016 ls ../hg19/schmittges2016/*.bb | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do bigBedToBed ../hg19/schmittges2016/${i}.bb temp.bed; awk '$4=$1":"$2"-"$3' temp.bed > test.bed; liftOver -multiple test.bed ../lift/hg19_to_hg38reps.over.chain ../hg38reps/schmittges2016/${i}_hg38reps.bed ../hg38reps/schmittges2016/${i}_hg38reps.unmapped; bedSort ../hg38reps/schmittges2016/${i}_hg38reps.bed ../hg38reps/schmittges2016/${i}_hg38reps.bed; bedToBigBed ../hg38reps/schmittges2016/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/schmittges2016/${i}_hg38reps.bb; done #make coverage ls ../hg38reps/schmittges2016/*.bed | cut -f 4 -d"/" | cut -f 1 -d"." | while read i; do bedtools genomecov -bg -split -i ../hg38reps/schmittges2016/${i}.bed -g ../hg38reps/hg38reps.sizes > temp.bg; bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes ../hg38reps/schmittges2016/${i}.bw; done #make trackdb echo -e "track schmittges2016\nshortLabel Schmittges_Hughes 2016\nlongLabel Schmittges_Hughes Zinc Finger ChIP-Seq\ngroup summitCov\ntype bigWig\ncompositeTrack on\nvisibility hide\n" > trackDb.schmittges2016.txt ls ../hg38reps/schmittges2016/*.bw | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_cov\n\tparent schmittges2016\n\tshortLabel "t[1]" "t[2]"\n\tlongLabel schmittges2016 "t[1]" "t[2]"\n\tvisibility dense\n\ttype bigWig\n\tbigDataUrl schmittges2016/"$1".bw" }' >> trackDb.schmittges2016.txt echo -e "\ntrack schmittges2016Summits\nshortLabel Schmittges_Hughes 2016\nlongLabel Schmittges_Hughes Zinc Finger ChIP-Seq\ngroup summits\ntype bigBed\ncompositeTrack on\nvisibility hide\n" >> trackDb.schmittges2016.txt ls ../hg38reps/schmittges2016/*.bb | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_sum\n\tparent schmittges2016Summits\n\tshortLabel "t[1]" "t[2]"\n\tlongLabel schmittges2016 "t[1]" "t[2]"\n\tvisibility dense\n\ttype bigBed\n\tbigDataUrl schmittges2016/"$1".bb" }' >> trackDb.schmittges2016.txt mv trackDb.schmittges2016.txt ../hg38reps/ #########Imbeault_Trono 2017 # liftOver of the genes to hg38reps # # # Output: 251 *_hg38reps.bb files in ../hg38reps/imbeault2017 ### #make bb summits mkdir ../hg38reps/imbeault2017 ls ../hg19/imbeault2017/*.bed | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do awk '$4=$1":"$2"-"$3' ../hg19/imbeault2017/*${i}.bed > test.bed; liftOver -multiple test.bed ../lift/hg19_to_hg38reps.over.chain ../hg38reps/imbeault2017/${i}_hg38reps.bed ../hg38reps/imbeault2017/${i}_hg38reps.unmapped; bedSort ../hg38reps/imbeault2017/${i}_hg38reps.bed ../hg38reps/imbeault2017/${i}_hg38reps.bed; bedToBigBed ../hg38reps/imbeault2017/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/imbeault2017/${i}_hg38reps.bb; done #make coverage ls ../hg38reps/imbeault2017/*.bed | cut -f 4 -d"/" | cut -f 1 -d"." | while read i; do bedtools genomecov -bg -split -i ../hg38reps/imbeault2017/${i}.bed -g ../hg38reps/hg38reps.sizes > temp.bg; bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes ../hg38reps/imbeault2017/${i}.bw; done #make trackdb echo -e "track imbeault2017\nshortLabel Imbeault_Trono 2017\nlongLabel Imbeault_Trono Zinc Finger ChIP-Seq\ngroup summitCov\ntype bigWig\ncompositeTrack on\nvisibility hide\n" > trackDb.imbeault2017.txt ls ../hg38reps/imbeault2017/*.bw | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_cov\n\tparent imbeault2017\n\tshortLabel "t[1]" "t[2]"\n\tlongLabel imbeault2017 "t[1]" "t[2]"\n\tvisibility dense\n\ttype bigWig\n\tbigDataUrl imbeault2017/"$1".bw" }' >> trackDb.imbeault2017.txt echo -e "\ntrack imbeault2017Summits\nshortLabel Imbeault_Trono 2017\nlongLabel Imbeault_Trono Zinc Finger ChIP-Seq\ngroup summits\ntype bigBed\ncompositeTrack on\nvisibility hide\n" >> trackDb.imbeault2017.txt ls ../hg38reps/imbeault2017/*.bb | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_sum\n\tparent imbeault2017Summits\n\tshortLabel "t[1]" "t[2]"\n\tlongLabel imbeault2017 "t[1]" "t[2]"\n\tvisibility dense\n\ttype bigBed\n\tbigDataUrl imbeault2017/"$1".bb" }' >> trackDb.imbeault2017.txt mv trackDb.imbeault2017.txt ../hg38reps/ #ls *.mapInfo | cut -f 1 -d"."| while read i; do mapInfoToBed.sh ${i}.mapInfo >${i}.bed; done #########trf # calculation of trf on hg38 # # # Output: trf.bb ### trfBig ../hg38reps/hg38reps.fa trf.out bedAt=trf.raw cat trf.tab | gawk 'BEGIN {FS="\t"; OFS="\t"} {$4=$16; if (length($4) > 256){$4=substr($4,1,206)"_truncated"}; NR=14; print}' | cut -f-15 > trf.bed bedSort trf.bed trf.bed bedToBigBed trf.bed ../hg38reps/hg38reps.sizes trf.bb -type=bed4+11 #-as= bedToBigBed -as= #########other_cons_aln # calculation of trf on hg38 # # # Output: other_cons_aln.bb ### blat ../hg38reps/hg38reps.fa ../hg38reps/hg38reps.fa other_cons_aln.psl pslToBigPsl other_cons_aln.psl stdout | sort -k1,1 -k2,2n > other_cons_aln.txt wget https://genome.ucsc.edu/goldenPath/help/examples/bigPsl.as bedSort other_cons_aln.txt other_cons_aln.txt bedToBigBed as=bigPsl.as -type=bed12+13 -tab other_cons_aln.txt ../hg38reps/hg38reps.sizes other_cons_aln.bb #########Tsankov_Meissner2014 # liftOver of the genes to hg38reps # # # Output: 204 *_hg38reps.bb files in ../hg38reps/imbeault2017 ### mkdir ../hg19/tsankov2014 mkdir ../hg38reps/tsankov2014 grep -B3 "longLabel" trackDb.tsankov2014.txt | tr "\n" "\t" | tr -s "-" "\n" | cut -f 2,5 | cut -f 5,10-16 -d" " | tr -s "_" "\t" | cut -f 1,3 | sed "s/\//-/g" | tr -s " " "_" | sed "s/\t_/\t/g" > tsankov2014.key cat tsankov2014.key | while read i g; do mv ../hg19/tsankov2014/${i}_peaks.bb ../hg19/tsankov2014/${i}_${g}_peaks.bb; done #make summits ls ../hg19/tsankov2014/*.bb | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do bigBedToBed ../hg19/tsankov2014/${i}.bb temp.bed; awk '$4=$1":"$2"-"$3' temp.bed > test.bed; liftOver -multiple test.bed ../lift/hg19_to_hg38reps.over.chain ../hg38reps/tsankov2014/${i}_hg38reps.bed ../hg38reps/tsankov2014/${i}_hg38reps.unmapped; bedSort ../hg38reps/tsankov2014/${i}_hg38reps.bed ../hg38reps/tsankov2014/${i}_hg38reps.bed; bedToBigBed ../hg38reps/tsankov2014/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/tsankov2014/${i}_hg38reps.bb; done #make coverage ls ../hg38reps/tsankov2014/*.bed | cut -f 4 -d"/" | cut -f 1 -d"." | while read i; do bedtools genomecov -bg -split -i ../hg38reps/tsankov2014/${i}.bed -g ../hg38reps/hg38reps.sizes > temp.bg; bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes ../hg38reps/tsankov2014/${i}.bw; done #make trackdb echo -e "track tsankov2014\nshortLabel Tsankov_Meissner 2014\nlongLabel Tsankov_Meissner TF ChIP-Seq in ES differentiation\ngroup summitCov\ntype bigWig\ncompositeTrack on\nvisibility hide\n" > trackDb.tsankov2014.txt ls ../hg38reps/tsankov2014/*.bw | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); if (t[4]!="peaks") {t[3] ="$t[3] $t[4]"}; print "\n\ttrack "$1"_cov\n\tparent tsankov2014\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel tsankov2014 "t[1]" "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigWig\n\tbigDataUrl tsankov2014/"$1".bw" }' >> trackDb.tsankov2014.txt echo -e "\ntrack tsankov2014Summits\nshortLabel Tsankov_Meissner 2014\nlongLabel Tsankov_Meissner TF ChIP-Seq in ES differentiation\ngroup summits\ntype bigBed\ncompositeTrack on\nvisibility hide\n" >> trackDb.tsankov2014.txt ls ../hg38reps/tsankov2014/*.bb | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); if (t[4]!="peaks") {t[3] ="$t[3] $t[4]"} ; print "\n\ttrack "$1"_sum\n\tparent tsankov2014Summits\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel tsankov2014 "t[1]" "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigBed\n\tbigDataUrl tsankov2014/"$1".bb" }' >> trackDb.tsankov2014.txt mv trackDb.tsankov2014.txt ../hg38reps/ #########Mappings to the Human Genome # liftOver of the genes to hg38reps # # # Output: 204 *_hg38reps.bb files in ../hg38reps/imbeault2017 ### wget -r –level=0 -E –ignore-length -x -k -p -erobots=off -np -N https://hgwdev.gi.ucsc.edu/~max/kznf/hg38reps/hg38/seqs/pslLift/ ls pslLift/*.psl | while read i; do shuf $i | head -n 500; done > merged_500.psl awk '$10=($10":"$11"-"$12)' merged_500.psl | tr -s " " "\t" > more_500.psl pslToBigPsl more_500.psl stdout | sort -k1,1 -k2,2n > more_500.txt wget https://genome.ucsc.edu/goldenPath/help/examples/bigPsl.as bedSort more_500.txt more_500.txt bedToBigBed as=bigPsl.as -type=bed12+13 -tab more_500.txt ../hg38reps/hg38reps.sizes mappings_500.bb ### http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeRegDnaseClustered/wgEncodeRegDnaseClusteredV3.bed.gz gunzip wgEncodeRegDnaseClusteredV3.bed.gz sed "s/\//-/g" encRegTfbsClusteredWithCells.hg38.bed | awk '{split ($6,a,","); for (i in a) { print ($1"\t"$2"\t"$3"\t"$4"\t"$5"\t+") >> "../hg38/encRegTfbsClustered/encRegTfbsClusteredWithCells_"$4"_"a[i]".bed"}}' #make bb summits mkdir ../hg38reps/encRegTfbsClustered ls ../hg38/encRegTfbsClustered/*.bed | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do awk '$4=$1":"$2"-"$3' ../hg38/encRegTfbsClustered/*${i}.bed > test.bed; liftOver -multiple test.bed ../lift/hg38_to_hg38reps.over.chain ../hg38reps/encRegTfbsClustered/${i}_hg38reps.bed ../hg38reps/encRegTfbsClustered/${i}_hg38reps.unmapped; bedSort ../hg38reps/encRegTfbsClustered/${i}_hg38reps.bed ../hg38reps/encRegTfbsClustered/${i}_hg38reps.bed; bedToBigBed ../hg38reps/encRegTfbsClustered/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/encRegTfbsClustered/${i}_hg38reps.bb; done #make coverage ls ../hg38reps/encRegTfbsClustered/*.bed | cut -f 4 -d"/" | cut -f 1 -d"." | while read i; do bedtools genomecov -bg -split -i ../hg38reps/encRegTfbsClustered/${i}.bed -g ../hg38reps/hg38reps.sizes > temp.bg; bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes ../hg38reps/encRegTfbsClustered/${i}.bw; done #make trackdb echo -e "track ENCODE_TFBS\nshortLabel ENCODE_TFBS\nlongLabel ENCODE TFBS Clustered\ngroup summitCov\ntype bigWig\ncompositeTrack on\nvisibility hide\n" > trackDb.encTFBSclustered.txt ls ../hg38reps/encRegTfbsClustered/*.bw | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_cov\n\tparent ENCODE_TFBS\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel ENCODE TFBS Clustered "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigWig\n\tbigDataUrl encRegTfbsClustered/"$1".bw" }' >> trackDb.encTFBSclustered.txt echo -e "\ntrack ENCODE_TFBSSummits\nENCODE_TFBS 2017\nlongLabel Imbeault_Trono Zinc Finger ChIP-Seq\ngroup summits\ntype bigBed\ncompositeTrack on\nvisibility hide\n" >> trackDb.encTFBSclustered.txt ls ../hg38reps/encRegTfbsClustered/*.bb | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_sum\n\tparent ENCODE_TFBS_Summits\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel iENCODE TFBS Clustered "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigBed\n\tbigDataUrl encRegTfbsClustered/"$1".bb" }' >> trackDb.encTFBSclustered.txt mv trackDb.imbeault2017.txt ../hg38reps/ #### # #Meta summits ### ls ../hg38reps/imbeault2017/*.bed | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do macs2 callpeak -t ../hg38reps/imbeault2017/${i}.bed --keep-dup all --outdir macs2 --nomodel --call-summits -n ${i}; done cd macs2 cat *_summits* | awk '$5 > 100'| sed "s/_Trono_hg38reps_peak_.*\t/\t/g" | sort -k5 -n > imbeault2017_summit_summary.bed bedSort imbeault2017_summit_summary.bed imbeault2017_summit_summary.bed bedtools slop -i imbeault2017_summit_summary.bed -g ../../hg38reps/hg38reps.sizes -b 10 > temp.bed mv temp.bed imbeault2017_summit_summary.bed ls ../hg38reps/imbeault2017/*.bed | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do macs2 callpeak -t ../hg38reps/imbeault2017/${i}.bed --keep-dup all --outdir macs2 --nomodel --call-summits -n ${i}; done awk '{if($5>1000){$5=1000}}{printf ("%s\t%.0f\t%.0f\t%s\t%.0f\t%s\n", $1, $2, $3, $4, $5, "+")}' imbeault2017_summit_summary.bed > temp.bed bedToBigBed temp.bed ../hg38reps/hg38reps.sizes meta_summits.bb -type=bed6 #encode DCC wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgDnaseMasterSites/wgEncodeAwgDnaseMasterSites.bed.gz gunzip *.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgDnaseMasterSites/wgEncodeAwgDnaseMasterSources.tab cut -f 1-5 wgEncodeAwgDnaseMasterSites.bed > encode_DNAse_5.bed liftOver -multiple encode_DNAse_5.bed ../../lift/hg19_to_hg38reps.over.chain encode_DNAse_5_hg38reps.bed encode_DNAse_hg38reps.unmapped mv encode_DNAse_5_hg38reps.bed bedSort encode_DNAse_5_hg38reps.bed encode_DNAse_5_hg38reps.bed bedtools genomecov -bg -split -i encode_DNAse_5_hg38reps.bed -g hg38reps.sizes > temp.bg bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes encode_DNAse_5_hg38reps.bw #####theunissen # ls ../hg19/theunissen2016/*.bb | cut -f 4 -d "/" | cut -f 1 -d"." | while read i; do bigBedToBed ../hg19/theunissen2016/${i}.bb temp.bed; awk '$4=$1":"$2"-"$3' temp.bed > test.bed; liftOver -multiple test.bed ../lift/hg19_to_hg38reps.over.chain ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/theunissen2016/${i}_hg38reps.unmapped; bedSort ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/theunissen2016/${i}_hg38reps.bed; bedToBigBed ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/theunissen2016/${i}_hg38reps.bb; done #make coverage ls ../hg38reps/theunissen2016/*.bed | cut -f 4 -d"/" | cut -f 1 -d"." | while read i; do bedtools genomecov -bg -split -i ../hg38reps/theunissen2016/${i}.bed -g ../hg38reps/hg38reps.sizes > temp.bg; bedGraphToBigWig temp.bg ../hg38reps/hg38reps.sizes ../hg38reps/theunissen2016/${i}.bw; done #make trackdb echo -e "track theunissen2016\nshortLabel theunissen2016S\nlongLabel theunissen 2016 Primed Naive\ngroup summitCov\ntype bigWig\ncompositeTrack on\nvisibility hide\n" > trackDb.theunissen2016.txt ls ../hg38reps/theunissen2016/*.bw | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_cov\n\tparent theunissen2016\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel theunissen2016 Primed Naive "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigWig\n\tbigDataUrl theunissen2016/"$1".bw" }' >> trackDb.theunissen2016.txt echo -e "\ntrack theunissen2016Summits\nshortLabel theunissen2016Summits\nlongLabel theunissen 2016 Primed Naive\ngroup summits\ntype bigBed\ncompositeTrack on\nvisibility hide\n" >> trackDb.theunissen2016.txt ls ../hg38reps/theunissen2016/*.bb | cut -f 4 -d"/" | cut -f 1 -d"."| awk '{split($1,t,"_"); print "\n\ttrack "$1"_sum\n\tparent theunissen2016Summits\n\tshortLabel "t[2]" "t[3]"\n\tlongLabel theunissen2016 Primed Naive "t[2]" "t[3]"\n\tvisibility dense\n\ttype bigBed\n\tbigDataUrl theunissen2016/"$1".bb" }' >> trackDb.theunissen2016.txt mv trackDb.theunissen2016.txt ../hg38reps/ ##encode histonr http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwHistone/ rsync -a -P rsync://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwHistone/*broadPeak* ./ ls | while read i; do awk -v OFS="\t" '{if ($8 > 10){print $1,$2,$3, "peak_"NR, $5, $6}}' $i > temp.bed; cat temp.bed > $i; done ls | while read i; do awk '$4=$1":"$2"-"$3' $i > test.bed; liftOver -multiple test.bed ../../lift/hg19_to_hg38reps.over.chain ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.bed ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.unmapped; bedSort ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.bed ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.bed; bedToBigBed ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.bed ../../hg38reps/hg38reps.sizes ../../hg38reps/wgEncodeUwHistone/${i}_hg38reps.bb; done ls | while read i; do bigBedToBed ../hg19/theunissen2016/${i}.bb temp.bed; awk '$4=$1":"$2"-"$3' temp.bed > test.bed; liftOver -multiple test.bed ../lift/hg19_to_hg38reps.over.chain ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/theunissen2016/${i}_hg38reps.unmapped; bedSort ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/theunissen2016/${i}_hg38reps.bed; bedToBigBed ../hg38reps/theunissen2016/${i}_hg38reps.bed ../hg38reps/hg38reps.sizes ../hg38reps/theunissen2016/${i}_hg38reps.bb; done