#!/usr/bin/env python3
import sys
import os.path as osp
import glob
from pycbio.tsv import TsvReader
from pycbio.sys import fileOps
import re
import pipettor
from zooCommon import loadZooEquiv, getZooGenArkAcc, getZooGenArkRec, GenomeMissingError

class AliasBuilderException(Exception):
    """Base class for errors that stop the build of one genome's chromAlias.

    processZoonomia() catches this type, reports the failure on stderr and
    carries on with the next record.
    """

class GenArkDoesNotExist(AliasBuilderException):
    """Raised when an expected GenArk directory or idKeys file is not found.

    Derives from AliasBuilderException, like the other domain errors in this
    file, so that a single missing GenArk assembly is reported as a per-genome
    failure instead of aborting the whole run.

    The exception message is "<acc> <path>".
    """
    def __init__(self, acc, path):
        super().__init__(f"{acc} {path}")

class NoGenArkIdKeys(AliasBuilderException):
    """Raised by findGenArkIdsKeys() when no GenArk idKeys file was found for a record."""

class NotAccession(AliasBuilderException):
    """Raised by accToPathParts() when a string does not look like an assembly accession."""

class BadSciName(AliasBuilderException):
    """Raised by findGenArkIdsKeys() when a record's Species and Hub_Species differ."""

def accToPathParts(acc):
    """Split an assembly accession into the pieces used to build GenArk paths.

    For example "GCF_003709585.1" gives ("GCF", "003", "709", "585"): the
    three-letter prefix followed by the nine digits in groups of three.  The
    version suffix is required by the pattern but is not returned.

    Raises NotAccession if acc does not match the expected pattern.
    """
    accMatch = re.match("^([A-Z]{3})_([0-9]{3})([0-9]{3})([0-9]{3})\\.[0-9]+$", acc)
    if accMatch is not None:
        return accMatch.groups()
    raise NotAccession(acc)

def accToGenArkHubDir(acc):
    """Return the GenArk hub directory for an assembly accession.

    The directory is built from the accession parts, e.g.
    /hive/data/genomes/asmHubs/GCF/003/709/585/GCF_003709585.1/

    Raises NotAccession (via accToPathParts) if acc is malformed and
    GenArkDoesNotExist if the directory is not present.
    """
    accParts = accToPathParts(acc)
    path = osp.join("/hive/data/genomes/asmHubs", *accParts, acc)
    # os.path has no `exist`; the original call raised AttributeError every time
    if not osp.exists(path):
        raise GenArkDoesNotExist(acc, path)
    return path

def accToGenArkBuildDir(acc):
    """Locate the GenArk build directory for an assembly accession.

    Examples of the directories being looked for:
      /hive/data/genomes/asmHubs/genbankBuild/GCA/000/001/905/GCA_000001905.1_Loxafr3.0/
      /hive/data/genomes/asmHubs/refseqBuild/GCF/000/001/735/GCF_000001735.4_TAIR10.1/

    Accessions with prefix GCA are looked up under genbankBuild, all others
    under refseqBuild.

    Raises GenArkDoesNotExist when no directory matches and
    AliasBuilderException when more than one does.
    """
    accParts = accToPathParts(acc)
    if accParts[0] == "GCA":
        srcDir = "genbankBuild"
    else:
        srcDir = "refseqBuild"
    rootDir = osp.join("/hive/data/genomes/asmHubs", srcDir, *accParts)

    # The directory name carries an assembly-name suffix after the accession
    # (e.g. GCA_000001905.1_Loxafr3.0), so it has to be discovered by globbing.
    matches = glob.glob(osp.join(rootDir, acc + "_*"))
    if not matches:
        raise GenArkDoesNotExist(acc, rootDir)
    if len(matches) > 1:
        raise AliasBuilderException(f"multiple stupid directories for {acc}: " + " ".join(matches))
    return osp.join(rootDir, matches[0])

def getGenArkIdKeys(acc):
    """Return the path of the existing GenArk idKeys file for an accession.

    The file is named <buildDirName>.idKeys.txt, e.g.
    .../trackData/idKeys/GCA_004363605.1_IndInd_v1_BIUU.idKeys.txt
    It is sometimes not in the trackData dir, so both locations are checked,
    trackData/idKeys first and then idKeys.

    Raises GenArkDoesNotExist, naming the last location tried, if neither
    file exists.
    """
    buildDir = accToGenArkBuildDir(acc)
    fileName = osp.basename(buildDir) + ".idKeys.txt"
    for subDir in ("trackData/idKeys", "idKeys"):
        idKeys = osp.join(buildDir, subDir, fileName)
        if osp.exists(idKeys):
            return idKeys
    raise GenArkDoesNotExist(acc, idKeys)

def getHubIdsKeys(genome):
    """Return the relative path of the Zoonomia hub idKeys file for genome."""
    idKeysPath = f"../241-mammalian-2020v2-hub/{genome}/{genome}.idKeys.txt"
    return idKeysPath

def getHubChromSizes(genome):
    """Return the relative path of the Zoonomia hub chrom.sizes file for genome."""
    chromSizesPath = f"../241-mammalian-2020v2-hub/{genome}/chrom.sizes"
    return chromSizesPath

def countLines(path):
    """Return the line count of the file at path, as reported by `wc -l`."""
    wcOutput = pipettor.runout(["wc", "-l", path])
    # only the first whitespace-separated field of the output is the count
    fields = wcOutput.split()
    return int(fields[0])

def getOutChromAliasPath(zooRec):
    """Return output/<Hub_Species>/<Hub_Species>.chromAlias.txt for a record."""
    species = zooRec.Hub_Species
    return osp.join("output", species, species + ".chromAlias.txt")

def joinIdKeys(zooRec, genArkAcc, hubIdKeys, genArkIdsKeys):
    """Build the chromAlias file for one genome by joining two idKeys files.

    zooRec: record whose Hub_Species names the genome and the output file.
    genArkAcc: GenArk accession, used only in the progress message.
    hubIdKeys: path of the Zoonomia hub idKeys file.
    genArkIdsKeys: path of the GenArk idKeys file.

    The two files are joined on their first tab-separated column with
    `join` and columns 2-3 of the result are kept, preceded by a header line.
    The result is written to a temporary file and then installed atomically.

    Raises AliasBuilderException if the hub idKeys file is empty or if the
    ratio of GenArk to hub line counts is below 0.90 (Bison_bison is exempt).
    """
    # Validate before creating any output, so that a rejected genome does not
    # leave an orphaned temporary file behind each time the script is rerun.
    hubSeqCnt = countLines(hubIdKeys)
    genArkCnt = countLines(genArkIdsKeys)
    if hubSeqCnt == 0:
        # would otherwise be an uncaught ZeroDivisionError that aborts the whole run
        raise AliasBuilderException(f"{zooRec.Hub_Species} hub idKeys file is empty: {hubIdKeys}")

    # NOTE(review): this is the ratio of the input line counts, not of the
    # number of joined lines; confirm that this is the intended measure.
    frac = genArkCnt / hubSeqCnt
    print("    idKeys", zooRec.Hub_Species, hubSeqCnt, genArkAcc, genArkCnt, frac, file=sys.stderr)
    # Bison_bison is RefSeq in GenArk and GenBank in Zoonomia; it is exempted
    # from the coverage check (the original rationale comment was unfinished).
    if (frac < 0.90) and (zooRec.Hub_Species not in ("Bison_bison",)):
        raise AliasBuilderException(f"{zooRec.Hub_Species} poor coverage for chromAlias: {genArkCnt}/{hubSeqCnt}")

    outChromAlias = getOutChromAliasPath(zooRec)
    outChromAliasTmp = fileOps.atomicTmpFile(outChromAlias)

    header = "# zoonomiaHub\tgenArk"
    # `join` presumably needs both inputs sorted on the key column -- TODO confirm
    joinCmd = [["join", "-t", "\t", hubIdKeys, genArkIdsKeys],
               ["cut", "-f2-3"]]
    with open(outChromAliasTmp, 'w') as fh:
        # flush the header first: the pipeline writes to the same file directly
        print(header, file=fh, flush=True)
        pipettor.run(joinCmd, stdout=fh)
    fileOps.atomicInstall(outChromAliasTmp, outChromAlias)

def findGenArkIdsKeys(zooToGenArk, zooRec):
    """Find the GenArk idKeys file to use for a Zoonomia record.

    The accession in the zoo spreadsheet (zooRec.Accession) is tried first.
    If GenArk has nothing for it, the accession that zooToGenArk gives for
    zooRec.Species is tried instead.

    getGenArkIdKeys() signals a missing file by raising GenArkDoesNotExist
    rather than by returning a non-existent path, so the fallback has to be
    driven by that exception; without catching it the second lookup and the
    final NoGenArkIdKeys could never be reached.

    Returns the path of an existing idKeys file.
    Raises BadSciName if the fallback is needed but Species and Hub_Species
    differ, and NoGenArkIdKeys if neither accession has an idKeys file.
    """
    # first try accession in zoo spreadsheet
    try:
        caPathZoo = getGenArkIdKeys(zooRec.Accession)
    except GenArkDoesNotExist:
        caPathZoo = None
    if caPathZoo is not None:
        print("      using zoo acc:", caPathZoo, file=sys.stderr, flush=True)
        return caPathZoo

    # fall back to the GenArk accession recorded for this species
    if zooRec.Species != zooRec.Hub_Species:
        raise BadSciName(str(zooRec))
    try:
        caPathGen = getGenArkIdKeys(getZooGenArkAcc(zooToGenArk, zooRec.Species))
    except GenArkDoesNotExist as ex:
        raise NoGenArkIdKeys(str(zooRec)) from ex
    print("      using genark acc:", caPathGen, file=sys.stderr, flush=True)
    return caPathGen

def doProcessZoonomia(zooToGenArk, zooRec, outPath):
    """Find both idKeys files for a record and build its chromAlias file.

    outPath is accepted but not used here; joinIdKeys() derives the output
    path from zooRec itself.
    """
    idKeysGenArk = findGenArkIdsKeys(zooToGenArk, zooRec)
    idKeysHub = getHubIdsKeys(zooRec.Hub_Species)
    genArkAcc = getZooGenArkAcc(zooToGenArk, zooRec.Hub_Species)
    joinIdKeys(zooRec, genArkAcc, idKeysHub, idKeysGenArk)

def prInfo(zooToGenArk, zooRec):
    """Print the Zoonomia record and its GenArk record (or None) to stderr."""
    genArkRec = None
    try:
        genArkRec = getZooGenArkRec(zooToGenArk, zooRec.Species)
    except GenomeMissingError:
        # no GenArk record for this species; reported as None below
        pass
    for label, rec in (("Processing:", zooRec), ("    GenArk:", genArkRec)):
        print(label, str(rec), file=sys.stderr, flush=True)

def processZoonomia(zooToGenArk, zooRec):
    """Build the chromAlias file for one record unless it already exists.

    AliasBuilderException and FileNotFoundError are reported on stderr and
    swallowed, so that one bad genome does not stop the remaining ones.
    """
    outPath = getOutChromAliasPath(zooRec)
    if osp.exists(outPath):
        # already built by an earlier run
        print("Exists:", str(zooRec), file=sys.stderr, flush=True)
        return
    try:
        prInfo(zooToGenArk, zooRec)
        doProcessZoonomia(zooToGenArk, zooRec, outPath)
    except (AliasBuilderException, FileNotFoundError) as ex:
        print("    Failed:", zooRec.Hub_Species, zooRec.Accession, ex, file=sys.stderr, flush=True)

def main():
    """Process every record of the Zoonomia GenBank TSV."""
    zooToGenArk = loadZooEquiv()
    zooRecs = TsvReader("../241-mammalian-2020v2-hub/zoonomia-genbank.tsv")
    for zooRec in zooRecs:
        processZoonomia(zooToGenArk, zooRec)

# Script entry point.  NOTE(review): the call is unconditional (there is no
# `if __name__ == "__main__":` guard), so importing this file also runs main().
main()
