#!/usr/bin/env bash

# Get the variant data: the chr21 genotype VCF and its .tbi index.
# Abort on a failed download; every later step reads these files.
wget "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" || exit 1
wget "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi" || exit 1

# Get the reference
# The URL is taken from the "##reference=<url>" line within the first 100
# lines of the VCF. Do not enable pipefail for this pipeline: head closes the
# pipe early, so zcat is expected to terminate with SIGPIPE on success.
REF_URL=$(zcat ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | head -n 100 | grep "##reference" | cut -f2 -d"=")
# Turns out to be ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
if [[ -z "${REF_URL}" ]]; then
  # Without this guard wget would be invoked with no URL at all.
  echo "error: no ##reference line found in the VCF header" >&2
  exit 1
fi
wget "${REF_URL}" || exit 1
# The archive name is hard-coded to match the URL noted above.
gunzip hs37d5.fa.gz || exit 1

# Make the variation graph reference, with info about alleles.
# Abort if construction fails: both index jobs below read chr21_Dec_3.vg.
VCF=ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
/usr/bin/time -v ./vg construct -r hs37d5.fa -v "${VCF}" -p -R 21 -C -a > chr21_Dec_3.vg || exit 1

# Build the GBWT index (-G) and the xg index (-x) concurrently from the same
# graph. Each job's combined stdout/stderr (including the /usr/bin/time -v
# report) is teed to its own log. Each PID variable and log name must match
# the job it belongs to, because the notifications below rely on that pairing.
# Note: $! of a backgrounded pipeline is the PID of its last stage (tee),
# which only exits once the vg side has closed its output.
/usr/bin/time -v ./vg index -G chr21_Dec_3.gbwt -v "${VCF}" -o -p chr21_Dec_3.vg 2>&1 | tee gbwt_index_21g.log & PIDGBWT=$!
/usr/bin/time -v ./vg index -s -x chr21_Dec_3.xg -v "${VCF}" -o -p chr21_Dec_3.vg 2>&1 | tee xg_index_21x.log & PIDXG=$!

# Mail each job's log as soon as that job has finished.
wait "$PIDGBWT"
mailx -s "gbwt index done" yohei@ucsc.edu < gbwt_index_21g.log
wait "$PIDXG"
mailx -s "xg index done" yohei@ucsc.edu < xg_index_21x.log


