From 90cc622f5932ca029684f4350a3b6adab57b7342 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Wed, 7 Feb 2024 16:12:58 +0100 Subject: [PATCH 01/11] Update README.md --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ee10d3..1831d96 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,16 @@ Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`C Because of the clustering step, this pattern is only needed for the reference assembly. Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases). +# Download apptainer images +Before running the worflow, some apptainer images needs to be downloaded : +``` +apptainer build <your_apptainer_image_directory>/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest +apptainer build <your_apptainer_image_directory>/PyTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pytools:latest +apptainer build <your_apptainer_image_directory>/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest +``` + # Usage Clone this repo and create `data/haplotypes`. Place all your haplotypes in it. -Change the reference name and the apptainer image path in `config.yaml`. +Change the reference name and the apptainer image directory in `config.yaml`. Finally, change variables in `runSnakemake.sh` to match your needs (threads, memory, job name, mail, etc...). Go in the root directory of the repo and run `sbatch runSnakemake.sh` ! \ No newline at end of file -- GitLab From f9033dc679eefc9250049e741d70138040d2d1ec Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Wed, 7 Feb 2024 16:51:16 +0100 Subject: [PATCH 02/11] Cleaning and clarity improvements Removed deprecated scripts Added flags instead of positionnal arguments Removed deprecated rules in Snakefile --- Snakefile | 29 ++++---------- scripts/bin_split.py | 80 ------------------------------------- scripts/getPanachePAV.sh | 52 ++++++++++++++---------- scripts/statsAggregation.py | 18 ++++++--- 4 files changed, 52 insertions(+), 127 deletions(-) delete mode 100644 scripts/bin_split.py diff --git a/Snakefile b/Snakefile index a2eef6f..04c67ed 100644 --- a/Snakefile +++ b/Snakefile @@ -13,22 +13,6 @@ rule all: "output/figures", "output/pav" -rule haplotypes_aggregation: - # Concatenate every haplotype into a single, compressed, fasta - input: - expand("data/haplotypes/{sample}.fa.gz", sample=SAMPLES) - output: - "data/input_haplotypes/aggregated.fa.gz" - threads: workflow.cores - params: - apppath=config["app.path"] - shell: - """ - zcat {input} > $(dirname {output})/aggregated.fa - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ - -@ {threads} $(dirname {output})/aggregated.fa - """ - rule samtools_index: # Using samtools faidx to index compressed fasta input: @@ -67,7 +51,7 @@ rule ragtag_scaffolding: """ checkpoint clustering: - # Reading alignment file to create bins for each chromosome + # Read alignment file to create bins for each chromosome input: expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: @@ -85,7 +69,7 @@ checkpoint clustering: """ rule pggb_on_chr: - # Running pggb on a specific chromosome + # Runs pggb on a specific chromosome input: fa="data/chrInputs/{chromosome}.fa.gz", gzi="data/chrInputs/{chromosome}.fa.gz.gzi" @@ -109,7 +93,7 @@ rule pggb_on_chr: """ def availGraphs(wildcards): - # Getting the list of output from 'clustering' and retunring the list of corresponding graphs + # Get the list of output from 'clustering' and returns the list of corresponding graphs # This triggers snakemake to run pggb on every chromosomes available checkpoint_output = checkpoints.clustering.get(**wildcards).output[0] return expand( @@ -163,6 +147,7 @@ rule graph_stats: shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv") rule graph_figs: + # Creating figures using odgi viz input: availGraphs output: @@ -179,7 +164,7 @@ rule graph_figs: shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov}") rule aggregate_stats: - # Reading and merging all stats files into a final one + # Reading and merging all stats files into a final one called aggregatedStats.tsv input: "output/stats/" output: @@ -188,7 +173,7 @@ rule aggregate_stats: "python scripts/statsAggregation.py --input {input} --output {output}" rule get_pav: - # Create PAV matrix readable by panache + # Create PAV matrix readable by panache for a given chromosome scale graph input: availGraphs output: @@ -199,4 +184,4 @@ rule get_pav: run: shell("mkdir {output}") for graph in input: - shell("bash scripts/getPanachePAV.sh {graph} data/chrGraphs/$(basename {graph} .gfa) {output}/$(basename {graph} .gfa).pav.matrix.tsv {params.apppath}") + shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}") diff --git a/scripts/bin_split.py b/scripts/bin_split.py deleted file mode 100644 index 2cd3999..0000000 --- a/scripts/bin_split.py +++ /dev/null @@ -1,80 +0,0 @@ -from Bio import SeqIO -import pandas as pd -import argparse -import gzip -import os - -arg_parser = argparse.ArgumentParser(description='Bin splitter') -arg_parser.add_argument( - "--paf", - "-p", - dest = "paf", - required = True, - help = "Paf file" - ) -arg_parser.add_argument( - "--dir", - "-d", - dest = "dir", - required = True, - help = "Output directory" - ) -arg_parser.add_argument( - "--fasta", - "-f", - dest = "fasta", - required = True, - help = "Aggregated fasta" - ) -args = arg_parser.parse_args() - - -df = pd.read_csv( - args.paf, - sep = '\t', - header = None, - usecols = [0, 5, 11], - index_col = False, - names = ["ctg", "chr", "qual"] - ) - -chrList = df.chr.unique() -chrBins = {name:[] for name in chrList} - -# Getting the best mapping chromosome for each contig -for chrm in chrList: - subdf = df[df.chr == chrm] - - for ctg in subdf.ctg.unique(): - ctgdf = subdf[subdf.ctg == ctg].copy() - ctgdf.sort_values( - by = ["qual"], - inplace = True, - ascending= False - ) - - chrBins[ctgdf.iloc[0].chr].append(ctg) - -# Dictionnary linking contig name to its corresponding chromosome name -reverseDict = { - ctg: chrm - for chrm in chrBins.keys() - for ctg in chrBins[chrm] - } - -# Adding sequences to correct chromosome bins -chrSeq = {name:[] for name in chrList} -with gzip.open(args.fasta, "rt") as handle: - sequences = SeqIO.parse(handle, "fasta") - - for record in sequences: - try : - chrSeq[reverseDict[record.id]].append(record) - except : - pass - # The contig has not been mapped to a chromosome (weird) - -# Writing chromosome bin specific fasta files -for outname in chrSeq.keys(): - with open(os.path.join(args.dir, f"{outname}.fa"), "w") as output_handle: - SeqIO.write(chrSeq[outname], output_handle, "fasta") \ No newline at end of file diff --git a/scripts/getPanachePAV.sh b/scripts/getPanachePAV.sh index d3310ae..6584e87 100644 --- a/scripts/getPanachePAV.sh +++ b/scripts/getPanachePAV.sh @@ -1,38 +1,50 @@ #!/bin/bash -# Get a GFA and return a Presence/Absence Variants readable by Panache +# Get a GFA and returns a Presence/Absence Variants matrix readable by Panache # Panache apptainer image : oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/panache:latest -##Â Inputs -# $1 = GFA path -# $2 = Chromosome directory path (path used by pggb to store files like .paf) -# $3 = Output path -# $4 = Apptainer path to PanGeTools +# Initializing arguments +gfa="" # GFA path +appdir="" # Directory containing apptainer images +threads="" # Threads count +chrdir="" # Chromosome directory (directory used by pggb to store step files like .paf, etc...) +output="" # Output pav + +## Getting arguments +while getopts "g:a:t:d:o:" option; do + case "$option" in + g) gfa="$OPTARG";; + a) appdir="$OPTARG";; + t) threads="$OPTARG";; + d) chrdir="$OPTARG";; + o) output="$OPTARG";; + \?) echo "Usage: $0 [-g gfa] [-a appdir] [-t threads] [-d chrdir] [-o output]" >&2 + exit 1;; + esac +done # Getting chromosome name -chrname=$(basename $1 .gfa) +chrname=$(basename ${gfa} .gfa) # Getting paths in chromosome graph -apptainer run --app odgi "$4/PanGeTools.sif" paths -i $1 -L > $2/$chrname.paths.txt +apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L > ${chrdir}/$chrname.paths.txt # Creating bed using odgi untangle -apptainer run --app odgi "$4/PanGeTools.sif" untangle -i $1 -R $2/$chrname.paths.txt | sed '1d' | \ - cut -f 4,5,6 | sort | uniq > $2/$chrname.untangle.multiref.bed +apptainer run --app odgi "${appdir}/PanGeTools.sif" untangle -i ${gfa} -R ${chrdir}/$chrname.paths.txt -t $threads | sed '1d' | \ + cut -f 4,5,6 | sort | uniq > ${chrdir}/$chrname.untangle.multiref.bed # Running odgi pav -apptainer run --app odgi "$4/PanGeTools.sif" pav -i $1 -b $2/$chrname.untangle.multiref.bed -M \ - -B 0.5 > $2/$chrname.untangle.multiref.pavs.matrix.tsv +apptainer run --app odgi "${appdir}/PanGeTools.sif" pav -i ${gfa} -b ${chrdir}/$chrname.untangle.multiref.bed -M \ + -B 0.5 -t $threads > ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Adding empty column for Panache awk -F'\t' 'BEGIN {OFS="\t"} NR==1 {print $1, $2, $3, $4, "temp", "temp", $5, $6, $7} NR>1 {print $1, $2, $3, $4, ".", ".", $5, $6, $7}' \ - $2/$chrname.untangle.multiref.pavs.matrix.tsv > $2/temp1.tsv && \ - mv $2/temp1.tsv $2/$chrname.untangle.multiref.pavs.matrix.tsv + ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv > ${chrdir}/temp1.tsv && \ + mv ${chrdir}/temp1.tsv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Renaming columns for Panache -awk -F'\t' 'BEGIN {FS=OFS="\t"} NR==1 {sub($1, "#Chromosome"); sub($2, "FeatureStart"); sub($3, "FeatureStop"); sub($4, "Sequence_IUPAC_Plus"); sub($5, "SimilarBlocks"); sub($6, "Function")} {print}' \ - $2/$chrname.untangle.multiref.pavs.matrix.tsv > $2/temp2.tsv && \ - mv $2/temp2.tsv $2/$chrname.untangle.multiref.pavs.matrix.tsv +awk -F'\t' 'BEGIN {FS=OFS="\t"} NR==1 {sub(${gfa}, "#Chromosome"); sub(${chrdir}, "FeatureStart"); sub(${output}, "FeatureStop"); sub(${appdir}, "Sequence_IUPAC_Plus"); sub($5, "SimilarBlocks"); sub($6, "Function")} {print}' \ + ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv > ${chrdir}/temp2.tsv && \ + mv ${chrdir}/temp2.tsv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Moving the pav file to desired output -mv $2/$chrname.untangle.multiref.pavs.matrix.tsv $3 - - +mv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv ${output} diff --git a/scripts/statsAggregation.py b/scripts/statsAggregation.py index 0be68bc..c60fe94 100644 --- a/scripts/statsAggregation.py +++ b/scripts/statsAggregation.py @@ -1,16 +1,23 @@ +""" +Statistics aggregator for Pan1c workflow + +Aggregate chromosome level graph statistics into a single TSV. + +@author: alexis.mergez@inrae.fr +@version: 1.0 +""" + import os -from Bio import SeqIO -import gzip -import pandas as pd import argparse -arg_parser = argparse.ArgumentParser(description='Statistics aggregator') +## Arguments +arg_parser = argparse.ArgumentParser(description='Statistics aggregator for Pan1c workflow') arg_parser.add_argument( "--input", "-i", dest = "inputDir", required = True, - help = "Input directory" + help = "Input directory containing chromosome scale stats" ) arg_parser.add_argument( "--output", @@ -21,6 +28,7 @@ arg_parser.add_argument( ) args = arg_parser.parse_args() +## Main script lines = [] # Iterating over stats files -- GitLab From 4d2fdf225413a1077a32d26d2979f0dbce53b51b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Wed, 7 Feb 2024 17:59:29 +0100 Subject: [PATCH 03/11] Create timeStats.py Added timeStats.py to compute stats on pggb time logs (time cpu usage and memory used) [WIP] --- scripts/timeStats.py | 121 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 scripts/timeStats.py diff --git a/scripts/timeStats.py b/scripts/timeStats.py new file mode 100644 index 0000000..569f4b6 --- /dev/null +++ b/scripts/timeStats.py @@ -0,0 +1,121 @@ +""" +Time stats script for Pan1c workflow + +Given one or more tarball containing time logs of the pggb rule of the workflow, +the script returns an aggregated table containing descriptive stats + +@author: alexis.mergez@inrae.fr +@version: 1.0 +""" + +from Bio import SeqIO +import pandas as pd +import numpy as np +import argparse +import gzip +import tarfile +import os + +## Arguments +arg_parser = argparse.ArgumentParser(description='Pan1c time stats script') +arg_parser.add_argument( + "--input", + "-i", + nargs="+", + dest = "tarballs", + required = True, + help = "Tarballs containing time logs" + ) +arg_parser.add_argument( + "--fastadir", + "-f", + dest = "fastadir", + required = True, + help = "Directory containing fastas" + ) +arg_parser.add_argument( + "--output", + "-o", + dest = "output", + required = True, + help = "Output table (.tsv)" + ) +arg_parser.add_argument( + "--debug", + "-d", + action="store_true", + dest = "debug", + help = "Show debug" + ) +args = arg_parser.parse_args() + +## Toolbox + +## Main script +aggregatedData = {} + +# Iterating through tarballs +for tarball in args.tarballs: + + # Iterating in given tarball files + with tarfile.open(tarball, "r") as tar : + tarfilelist = [f for f in tar.getmembers() if f.isfile() and f.name.endswith(".time.log")] + + for file in tarfilelist: + + # Extracting content into a dataframe + handle = tar.extractfile(file) + _tmpdf = pd.read_csv( + handle, + sep=": ", + header=None, + engine='python' + ) + + if args.debug: print(f"[timeStats::debug] Current filename: {file.name}") + + # Getting Time, CPU and memory stats + time, cpu, memory = _tmpdf.iloc[4][1], int(_tmpdf.iloc[3][1][:-1]), int(_tmpdf.iloc[9][1])/100000 + chrid = (file.name).split(".")[0] + + # Convert time to pd.timedelta (handling the 2 input types) + try : + time = pd.to_timedelta(time) + except : + time = pd.to_timedelta(f"00:{time}") + + if args.debug: print(f"[timeStats::debug] Chr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb") + + # Adding to aggregatedData + aggregatedData[chrid] = {"time": time, "cpu": cpu, "mem": memory} + +# Reading fasta files to search for sequence length +chrSeqLength = {} +fastaList = [ + fasta for fasta in os.listdir(args.fastadir) + if os.path.isfile(os.path.join(args.fastadir, fasta)) + and fasta.split('.fa')[-1] == ".gz" + ] + +if args.debug: print(f"[timeStats::debug] Fasta list : {fastaList}") + +for fasta in fastaList: + with gzip.open(os.path.join(args.fastadir, fasta), "r") as handle: + lineCounts = [len(line[:-1]) for line in handle.readlines() if line[0] != ">"] + + chrName = os.path.basename(fasta).split(".")[0] + chrSeqLength[chrName] = np.sum(lineCounts) + + if args.debug: print(f"[timeStats::debug] Input fasta - {chrName} : {chrSeqLength[chrName]} bases") + + + + + + + + + + + + -- GitLab From eeeae73172ac5c8d5868c79270d948264636e499 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 8 Feb 2024 14:48:50 +0100 Subject: [PATCH 04/11] Added custom names Added "name" field to customize file names Added timeStats.py to compute statistics about graph assembler --- Snakefile | 65 ++++++++++++++++++++++---- config.yaml | 1 + scripts/timeStats.py | 107 ++++++++++++++++++++++++++++++++----------- 3 files changed, 136 insertions(+), 37 deletions(-) diff --git a/Snakefile b/Snakefile index 04c67ed..295a1e0 100644 --- a/Snakefile +++ b/Snakefile @@ -8,10 +8,11 @@ nHAP = len(SAMPLES) rule all: input: - "output/pggb.unnormalized.gfa", - "output/aggregatedStats.tsv", + "output/pan1c.pggb."+config['name']+".gfa", + "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv", "output/figures", - "output/pav" + "output/pav.matrices", + "output/pan1c.pggb."+config['name']+".time.stats.tsv" rule samtools_index: # Using samtools faidx to index compressed fasta @@ -35,7 +36,7 @@ rule ragtag_scaffolding: refgzi="data/haplotypes/"+config['reference']+".gzi", fa="data/haplotypes/{haplotype}.fa.gz" output: - "data/ragtag_hap/{haplotype}.ragtagged.fa.gz" + "data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" threads: 8 params: apppath=config["app.path"] @@ -53,7 +54,7 @@ rule ragtag_scaffolding: checkpoint clustering: # Read alignment file to create bins for each chromosome input: - expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) + expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: directory("data/chrInputs") threads: workflow.cores @@ -62,7 +63,8 @@ checkpoint clustering: shell: """ mkdir -p {output} - apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py --fasta {input} --output {output} + apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py \ + --fasta {input} --output {output} for file in {output}/*.fa; do apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file done @@ -119,7 +121,7 @@ rule graph_squeeze: input: "data/chrGraphs/graphsList.txt" output: - "output/pggb.unnormalized.gfa" + "output/pan1c.pggb."+config['name']+".gfa" threads: 16 params: apppath=config['app.path'] @@ -168,16 +170,19 @@ rule aggregate_stats: input: "output/stats/" output: - "output/aggregatedStats.tsv" + "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" shell: - "python scripts/statsAggregation.py --input {input} --output {output}" + """ + apptainer run {params.apppath}/pytools.sif python scripts/statsAggregation.py \ + --input {input} --output {output}" + """ rule get_pav: # Create PAV matrix readable by panache for a given chromosome scale graph input: availGraphs output: - directory("output/pav") + directory("output/pav.matrices") threads: 1 params: apppath=config['app.path'] @@ -185,3 +190,43 @@ rule get_pav: shell("mkdir {output}") for graph in input: shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}") + +rule pggb_log_compression: + input: + dir="logs/pggb", + flag="output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" + output: + tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", + chrLen="logs/chrInputLen.tsv" + shell: + """ + cd {input.dir} + tar -zcvf ../../{output.tar} * + cd ../.. + + if [ -f {output.chrLen} ]; then + rm {output.chrLen} + fi + + for chrFasta in data/chrInputs/*.fa.gz; do + filename=$(basename $chrFasta .fa.gz) + num_bases=$(zgrep -v '^>' $chrFasta | tr -d '\\n' | wc -c) + echo "$filename\t$num_bases" >> {output.chrLen} + done + """ + +rule time_statistics: + input: + tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", + chrLen="logs/chrInputLen.tsv" + output: + tsv="output/pan1c.pggb."+config['name']+".time.stats.tsv", + dir=directory("output/pggb.time.figs") + params: + apppath=config['app.path'] + shell: + """ + mkdir -p {output.dir} + apptainer run {params.apppath}/pytools.sif python scripts/timeStats.py \ + -i {input.tar} -l {input.chrLen} -o {output.tsv} -f {output.dir} + """ \ No newline at end of file diff --git a/config.yaml b/config.yaml index 0ddab08..be94960 100644 --- a/config.yaml +++ b/config.yaml @@ -1,3 +1,4 @@ +name: '02H' reference: 'CHM13v2.0.fa.gz' app.path: '/home/amergez/work/apptainer' wfmash.params: '--approx-map' diff --git a/scripts/timeStats.py b/scripts/timeStats.py index 569f4b6..c7a41d6 100644 --- a/scripts/timeStats.py +++ b/scripts/timeStats.py @@ -15,6 +15,8 @@ import argparse import gzip import tarfile import os +import matplotlib.pyplot as plt +import seaborn as sns ## Arguments arg_parser = argparse.ArgumentParser(description='Pan1c time stats script') @@ -27,11 +29,11 @@ arg_parser.add_argument( help = "Tarballs containing time logs" ) arg_parser.add_argument( - "--fastadir", - "-f", - dest = "fastadir", + "--chrinputlen", + "-l", + dest = "chrinputlen", required = True, - help = "Directory containing fastas" + help = "TSV containing input sequence length (generated by Pan1c workflow)" ) arg_parser.add_argument( "--output", @@ -40,6 +42,13 @@ arg_parser.add_argument( required = True, help = "Output table (.tsv)" ) +arg_parser.add_argument( + "--figdir", + "-f", + dest = "figdir", + required = True, + help = "Output directory for figures" + ) arg_parser.add_argument( "--debug", "-d", @@ -51,6 +60,13 @@ args = arg_parser.parse_args() ## Toolbox +def getRegEquation(x, y): + (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True) + ymean = np.mean(y) + sst = np.sum((y - ymean)**2) + r2 = (1 - (ssr/sst))[0] + return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})' + ## Main script aggregatedData = {} @@ -75,7 +91,7 @@ for tarball in args.tarballs: if args.debug: print(f"[timeStats::debug] Current filename: {file.name}") # Getting Time, CPU and memory stats - time, cpu, memory = _tmpdf.iloc[4][1], int(_tmpdf.iloc[3][1][:-1]), int(_tmpdf.iloc[9][1])/100000 + time, cpu, memory = _tmpdf.iloc[4][1], int(_tmpdf.iloc[3][1][:-1]), int(_tmpdf.iloc[9][1])/((1024)**2) chrid = (file.name).split(".")[0] # Convert time to pd.timedelta (handling the 2 input types) @@ -89,31 +105,68 @@ for tarball in args.tarballs: # Adding to aggregatedData aggregatedData[chrid] = {"time": time, "cpu": cpu, "mem": memory} -# Reading fasta files to search for sequence length -chrSeqLength = {} -fastaList = [ - fasta for fasta in os.listdir(args.fastadir) - if os.path.isfile(os.path.join(args.fastadir, fasta)) - and fasta.split('.fa')[-1] == ".gz" - ] - -if args.debug: print(f"[timeStats::debug] Fasta list : {fastaList}") +# Reading chrInputLen to get number of bases used to construct chromosomes +chrSeqLength = pd.read_csv( + args.chrinputlen, + sep="\t", + header=None, + index_col=0 +).to_dict()[1] -for fasta in fastaList: - with gzip.open(os.path.join(args.fastadir, fasta), "r") as handle: - lineCounts = [len(line[:-1]) for line in handle.readlines() if line[0] != ">"] - - chrName = os.path.basename(fasta).split(".")[0] - chrSeqLength[chrName] = np.sum(lineCounts) - - if args.debug: print(f"[timeStats::debug] Input fasta - {chrName} : {chrSeqLength[chrName]} bases") - - +if args.debug: print("[timeStats::debug]", chrSeqLength) +# Adding base counts to aggregatedData +for chromName, chromLength in chrSeqLength.items(): + aggregatedData[chromName]["mbases"] = chromLength/1000000 - - - +# Creating a dataframe +df = pd.DataFrame(aggregatedData).transpose() +df.time = pd.to_timedelta(df.time) +df.mem = df.mem.astype(float) +df.cpu = df.cpu.astype(int) +df.mbases = df.mbases.astype(int) + +if args.debug: print("[timeStats::debug]", df.dtypes) + +# Saving the dataframe +df.to_csv(args.output, sep='\t') + +# Creating some figures +# Time versus base count +sns.regplot(x=df.mbases, y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df.mbases, y=df.time.dt.total_seconds()) +plt.xlabel('Total input sequences length (Mb)') +plt.ylabel('Graph creation time (s)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png")) +plt.close() + +# Memory versus base count +sns.regplot(x=df.mbases, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df.mbases, y=df.mem) +plt.xlabel('Total input sequences length (Mb)') +plt.ylabel('Peak memory usage (GB)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png")) +plt.close() + +# CPU versus base count +sns.regplot(x=df.mbases, y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df.mbases, y=df.cpu) +plt.xlabel('Total input sequences length (Mb)') +plt.ylabel('CPU usage (%)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig(os.path.join(args.figdir,"CpuVSSeqLength.png")) +plt.close() + +# Memory versus CPU +sns.regplot(x=df.cpu, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5}) +equation = getRegEquation(x=df.cpu, y=df.mem) +plt.xlabel('CPU usage (%)') +plt.ylabel('Peak memory usage (GB)') +plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red') +plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png")) +plt.close() -- GitLab From 4e7da79bdddfd37b46c3624d70c08646bb511e74 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 8 Feb 2024 15:18:45 +0100 Subject: [PATCH 05/11] Fixed dependance issue Fixed dependance issue with pggb_log_compression missing logs/pggb folder --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 295a1e0..48cb11f 100644 --- a/Snakefile +++ b/Snakefile @@ -12,6 +12,7 @@ rule all: "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv", "output/figures", "output/pav.matrices", + "logs/pan1c.pggb."+config['name']+".logs.tar.gz", "output/pan1c.pggb."+config['name']+".time.stats.tsv" rule samtools_index: @@ -193,14 +194,13 @@ rule get_pav: rule pggb_log_compression: input: - dir="logs/pggb", flag="output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" output: tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", chrLen="logs/chrInputLen.tsv" shell: """ - cd {input.dir} + cd logs/pggb tar -zcvf ../../{output.tar} * cd ../.. -- GitLab From b67ee0169c38acccdf830e7cae8d12bcaeab637d Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 9 Feb 2024 10:23:39 +0100 Subject: [PATCH 06/11] Renaming and bux fixing Renamed timeStats.py to usageStats.py. Updated snakefile accordingly. Fixed bug in snakefile. Updated README --- README.md | 4 ++-- Snakefile | 12 +++++++----- scripts/{timeStats.py => usageStats.py} | 7 +++---- 3 files changed, 12 insertions(+), 11 deletions(-) rename scripts/{timeStats.py => usageStats.py} (96%) diff --git a/README.md b/README.md index a63b175..24feecf 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Pan1c/ ``` # Prepare your data This workflow can take chromosome level assemblies as well as contig level assemblies. -**Fasta files need to be compressed** using bgzip2 (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). -Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). +**Fasta files need to be compressed** using **bgzip2** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). +Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example where the fasata file is named CHM13.fa.gz). Because of the clustering step, this pattern is only needed for the reference assembly. Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases). diff --git a/Snakefile b/Snakefile index 48cb11f..bf95064 100644 --- a/Snakefile +++ b/Snakefile @@ -172,10 +172,12 @@ rule aggregate_stats: "output/stats/" output: "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv" + params: + apppath=config['app.path'] shell: """ apptainer run {params.apppath}/pytools.sif python scripts/statsAggregation.py \ - --input {input} --output {output}" + --input {input} --output {output} """ rule get_pav: @@ -215,18 +217,18 @@ rule pggb_log_compression: done """ -rule time_statistics: +rule ressources_statistics: input: tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz", chrLen="logs/chrInputLen.tsv" output: - tsv="output/pan1c.pggb."+config['name']+".time.stats.tsv", - dir=directory("output/pggb.time.figs") + tsv="output/pan1c.pggb."+config['name']+".usage.stats.tsv", + dir=directory("output/pggb.usage.figs") params: apppath=config['app.path'] shell: """ mkdir -p {output.dir} - apptainer run {params.apppath}/pytools.sif python scripts/timeStats.py \ + apptainer run {params.apppath}/pytools.sif python scripts/usageStats.py \ -i {input.tar} -l {input.chrLen} -o {output.tsv} -f {output.dir} """ \ No newline at end of file diff --git a/scripts/timeStats.py b/scripts/usageStats.py similarity index 96% rename from scripts/timeStats.py rename to scripts/usageStats.py index c7a41d6..d04d203 100644 --- a/scripts/timeStats.py +++ b/scripts/usageStats.py @@ -1,8 +1,8 @@ """ -Time stats script for Pan1c workflow +Usage statistics script for Pan1c workflow Given one or more tarball containing time logs of the pggb rule of the workflow, -the script returns an aggregated table containing descriptive stats +the script returns an aggregated table as well as figures @author: alexis.mergez@inrae.fr @version: 1.0 @@ -19,7 +19,7 @@ import matplotlib.pyplot as plt import seaborn as sns ## Arguments -arg_parser = argparse.ArgumentParser(description='Pan1c time stats script') +arg_parser = argparse.ArgumentParser(description='Pan1c usage stats script') arg_parser.add_argument( "--input", "-i", @@ -59,7 +59,6 @@ arg_parser.add_argument( args = arg_parser.parse_args() ## Toolbox - def getRegEquation(x, y): (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True) ymean = np.mean(y) -- GitLab From 5d6be0d2366c011797f9a4ec4277fef7339559f2 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 9 Feb 2024 11:24:56 +0100 Subject: [PATCH 07/11] Update Snakefile Added dynamic threading for get_pav rule --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index bf95064..32a6f4f 100644 --- a/Snakefile +++ b/Snakefile @@ -186,7 +186,7 @@ rule get_pav: availGraphs output: directory("output/pav.matrices") - threads: 1 + threads: 0.25 * workflow.cores params: apppath=config['app.path'] run: -- GitLab From 80edfa2a05a1df2a23bf758c3e2d30d255c6482b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 9 Feb 2024 11:41:41 +0100 Subject: [PATCH 08/11] Switched to snakebox Switched to snakebox and its included snakemake. See : https://forgemia.inra.fr/alexis.mergez/pangratools Updated README accordingly --- README.md | 1 + runSnakemake.sh | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 24feecf..61656e4 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Before running the worflow, some apptainer images needs to be downloaded : apptainer build <your_apptainer_image_directory>/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest apptainer build <your_apptainer_image_directory>/pytools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pytools:latest apptainer build <your_apptainer_image_directory>/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest +apptainer build <your_apptainer_image_directory>/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/snakebox:latest ``` # Usage diff --git a/runSnakemake.sh b/runSnakemake.sh index 84f9786..1df4d4c 100755 --- a/runSnakemake.sh +++ b/runSnakemake.sh @@ -1,17 +1,14 @@ #!/bin/bash #SBATCH -p unlimitq #SBATCH --cpus-per-task=32 -#SBATCH -o <log path>/pggb.1chr.<id>.log -#SBATCH -J pggb.1chr.<id> +#SBATCH -o <log path>.log +#SBATCH -J <jname> #SBATCH --mem=1T #SBATCH --mail-type=BEGIN,END,FAIL #SBATCH --mail-user=<mail> module purge module load containers/Apptainer/1.2.5 -module load devel/Miniconda/Miniconda3 -source ~/.bashrc -mamba activate snakemake - -snakemake -c $(nproc) \ No newline at end of file +/usr/bin/time -v -o <whole log file>.log \ + 'apptainer run <snakemakepath>.sif snakemake -c $(nproc)' \ No newline at end of file -- GitLab From 6f2c328edbfafad7c60a42fb5f42ff054e0fc7fc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 9 Feb 2024 13:56:23 +0100 Subject: [PATCH 09/11] Update getPanachePAV.sh Added -P to odgi commands to get progression logs. --- scripts/getPanachePAV.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/getPanachePAV.sh b/scripts/getPanachePAV.sh index 6584e87..719b4cf 100644 --- a/scripts/getPanachePAV.sh +++ b/scripts/getPanachePAV.sh @@ -29,12 +29,12 @@ chrname=$(basename ${gfa} .gfa) apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L > ${chrdir}/$chrname.paths.txt # Creating bed using odgi untangle -apptainer run --app odgi "${appdir}/PanGeTools.sif" untangle -i ${gfa} -R ${chrdir}/$chrname.paths.txt -t $threads | sed '1d' | \ +apptainer run --app odgi "${appdir}/PanGeTools.sif" untangle -i ${gfa} -R ${chrdir}/$chrname.paths.txt -t $threads -P | sed '1d' | \ cut -f 4,5,6 | sort | uniq > ${chrdir}/$chrname.untangle.multiref.bed # Running odgi pav apptainer run --app odgi "${appdir}/PanGeTools.sif" pav -i ${gfa} -b ${chrdir}/$chrname.untangle.multiref.bed -M \ - -B 0.5 -t $threads > ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv + -B 0.5 -t $threads -P > ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Adding empty column for Panache awk -F'\t' 'BEGIN {OFS="\t"} NR==1 {print $1, $2, $3, $4, "temp", "temp", $5, $6, $7} NR>1 {print $1, $2, $3, $4, ".", ".", $5, $6, $7}' \ -- GitLab From 629eda886ae6b5651f9c91513893f77d75f759fc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 9 Feb 2024 14:42:09 +0100 Subject: [PATCH 10/11] Update runSnakemake.sh Back to old scripts. apptainer is not available when snakemake is launched from apptainer image --- runSnakemake.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runSnakemake.sh b/runSnakemake.sh index 1df4d4c..6f7c9df 100755 --- a/runSnakemake.sh +++ b/runSnakemake.sh @@ -9,6 +9,9 @@ module purge module load containers/Apptainer/1.2.5 +module load devel/Miniconda/Miniconda3 -/usr/bin/time -v -o <whole log file>.log \ - 'apptainer run <snakemakepath>.sif snakemake -c $(nproc)' \ No newline at end of file +source ~/.bashrc +mamba activate snakemake + +snakemake -c $(nproc) -- GitLab From de9b8c193c9625df0d809c2053b25f30a58e4037 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 9 Feb 2024 14:43:24 +0100 Subject: [PATCH 11/11] Update Snakefile Changed workflow to use graphList.txt instead of recomputing the list of ouput from clustering checkpoint. Added .og graph generation in rule pggb_on_chr --- Snakefile | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/Snakefile b/Snakefile index 32a6f4f..c0e1e7b 100644 --- a/Snakefile +++ b/Snakefile @@ -77,7 +77,8 @@ rule pggb_on_chr: fa="data/chrInputs/{chromosome}.fa.gz", gzi="data/chrInputs/{chromosome}.fa.gz.gzi" output: - "data/chrGraphs/{chromosome}.gfa" + gfa="data/chrGraphs/{chromosome}.gfa" + og="data/chrGraphs/{chromosome}.gfa" threads: 16 params: pggb=config['pggb.params'], @@ -92,7 +93,9 @@ rule pggb_on_chr: -i {input.fa} {params.pggb} -n {nHAP} -t {threads} -T {threads} -o data/chrGraphs/{wildcards.chromosome} 2>&1 | \ tee {log.cmd} echo $(ls data/chrGraphs/{wildcards.chromosome}) - mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output} + mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output.gfa} + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + build -s -O -t {threads} -P -g {output.gfa} -o {output.og} """ def availGraphs(wildcards): @@ -138,7 +141,7 @@ rule graph_squeeze: rule graph_stats: # Using odgi to produce stats on every chromosome scale graph input: - availGraphs + "data/chrGraphs/graphsList.txt" output: directory("output/stats") threads: 4 @@ -146,13 +149,17 @@ rule graph_stats: apppath=config['app.path'] run: shell("mkdir {output}") - for graph in input: + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv") rule graph_figs: # Creating figures using odgi viz input: - availGraphs + "data/chrGraphs/graphsList.txt" output: directory("output/figures") threads: 4 @@ -162,9 +169,13 @@ rule graph_figs: pcov=config['odgi.pcov.params'] run: shell("mkdir {output}") - for graph in input: - shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz}") - shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov}") + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: + shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P") + shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P") rule aggregate_stats: # Reading and merging all stats files into a final one called aggregatedStats.tsv @@ -183,7 +194,7 @@ rule aggregate_stats: rule get_pav: # Create PAV matrix readable by panache for a given chromosome scale graph input: - availGraphs + "data/chrGraphs/graphsList.txt" output: directory("output/pav.matrices") threads: 0.25 * workflow.cores @@ -191,7 +202,11 @@ rule get_pav: apppath=config['app.path'] run: shell("mkdir {output}") - for graph in input: + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}") rule pggb_log_compression: -- GitLab