From a7794e7da94ac8967095dbc79743a7c6b1760160 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 13 May 2024 10:42:01 +0200 Subject: [PATCH 01/30] Started gaf2aln dev --- gaf2aln.py | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 gaf2aln.py diff --git a/gaf2aln.py b/gaf2aln.py new file mode 100644 index 0000000..fea7d30 --- /dev/null +++ b/gaf2aln.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +gaf2aln +Convert gaf alignement to sam or paf + +@author: alexis.mergez@inrae.fr +@version: 0.1 +""" + +import numpy as np +import pandas as pd +import argparse +import os + +version = "0.1" + +## Argument parser +arg_parser = argparse.ArgumentParser(description='GFAvc: GFA version converter') +arg_parser.add_argument( + "--gfa", + "-g", + dest = "gfa", + required = True, + help = "Graph (.gfa v1)" + ) +arg_parser.add_argument( + "--gaf", + "-a", + dest = "gaf", + required = True, + help = "Alignement file (.gaf)" + ) +arg_parser.add_argument( + "--format", + "-f", + dest = "format", + default = "P", + help = "Output file format. 
(S: sam, P: paf (default))" + ) +arg_parser.add_argument( + '--version', + '-v', + action="store_true", + dest = "version", + help = "Show version" +) +args = arg_parser.parse_args() + +# Printing version +if args.version: + print(version) + os._exit(0) + +# Parsing the .gaf file +with open(args.gaf, 'r') as file: + gaf_lines = file.readlines() + +gaf_col = [ + "QRY.NAME", "QRY.LEN", "QRY.START", "QRY.END", "STRAND", + "PATH.MATCH", "PATH.LEN", "ALN.START", "ALN.END", + "RES.MATCH", "ALN.BLOCK.LEN", "MAPPING.QUAL" + ] + +# Creating dictionnary to store alignments +aln_dict = {} +for line in range(len(gaf_lines)): + ## Splitting the line by tabulation + line_content = gaf_lines[line][:-1].split('\t') + + ## Adding alignement info to dictionnary + aln_dict[f"ALN_{line+1}"] = { + gaf_col[i]: line_content[i] for i in range(len(gaf_col)) + } + + ## Splitting "PATH.MATCH" into a list + aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = [ + int(node_id) for node_id in aln_dict[f"ALN_{line+1}"]["PATH.MATCH"].split(">")[1:] + ] + + ## Adding tags + aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:]) + +# Getting nodes of interest ids + +# Debug +print(aln_dict) + + + +# Parsing the .gaf +with open(args.gfa, 'r') as file: + gfa_lines = file.readlines() + -- GitLab From 0658a3f77518ff5b4c49b461a04d910a3aab673d Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Mon, 13 May 2024 12:44:32 +0200 Subject: [PATCH 02/30] Update gaf2aln.py --- gaf2aln.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index fea7d30..ced7e7f 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -82,13 +82,75 @@ for line in range(len(gaf_lines)): aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:]) # Getting nodes of interest ids +aln_nodes = np.unique([ + node_id + for aln in aln_dict.keys() + for node_id in aln_dict[aln]["PATH.MATCH"] +]) +print(aln_nodes) # Debug 
print(aln_dict) - - -# Parsing the .gaf +# Parsing the .gfa with open(args.gfa, 'r') as file: gfa_lines = file.readlines() +# Nodes length dictionnary structured as follow : +# {<NODE.ID>: <NODE.LENGTH>} +nodes_length = {} +# Nodes dictionnary structured as follow : +# {<ALN.NODE.ID> : {PATHS: {<PATH.NAME>: (start, end)}}} +nodes = {} +# Paths dictionnary structured as follow : +# {<PATH.NAME>: {NODES: {NODE.ID: <NODE.ID>, ORIENT: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} +paths = {} +# Links dictionnary structured as follow : +# {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}} +links = {} + +for line in gfa_lines: + line_content = line[:-1].split("\t") + line_id = line_content[0] + + # Segment line + if line_id == "S" : + + nodes_length[str(line_content[1])] = len(line_content[2]) + + if line_content[1] in aln_nodes: + nodes[str(line_content[1])] = {} + + # Link line + elif line_id == "L": + try : + links[str(line_content[1])][str(line_content[3])] = { + "FROM": str(line_content[2]), + "TO": str(line_content[4]) + } + + except : + links[str(line_content[1])] = { + [str(line_content[3])] : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} + } + + # Path line + elif line_id == "P": + paths[str(line_content[1])] = { + "NODES": [ + {"NODE.ID": str(node_id[:-1]), "ORIENT": str(node_id[-1])} + for node_id in line_content[2].split(',') + ], + "CIGAR": line_content[3] + } + +# Getting the start and end position of alignment nodes on each paths +def getPathPos(node_id, nodes=nodes, nodes_length=nodes_length, links=links, paths=paths) + +for node_id in nodes.keys(): + for path_id in paths.keys(): + if + + + + -- GitLab From 81bae737dd5a53d0bf976f78dcd532cb2db1a439 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Tue, 14 May 2024 10:19:15 +0200 Subject: [PATCH 03/30] Update gaf2aln.py --- gaf2aln.py | 141 ++++++++++++++++++++++++++++++++++++++++------------- 1 
file changed, 107 insertions(+), 34 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index ced7e7f..823a51d 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -11,6 +11,7 @@ Convert gaf alignement to sam or paf import numpy as np import pandas as pd import argparse +import concurrent.futures import os version = "0.1" @@ -38,6 +39,15 @@ arg_parser.add_argument( default = "P", help = "Output file format. (S: sam, P: paf (default))" ) +arg_parser.add_argument( + "--threads", + "-t", + dest = "threads", + required = False, + default = 1, + type = int, + help = "Number of threads" + ) arg_parser.add_argument( '--version', '-v', @@ -103,53 +113,116 @@ nodes_length = {} # {<ALN.NODE.ID> : {PATHS: {<PATH.NAME>: (start, end)}}} nodes = {} # Paths dictionnary structured as follow : -# {<PATH.NAME>: {NODES: {NODE.ID: <NODE.ID>, ORIENT: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} +# {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} paths = {} # Links dictionnary structured as follow : # {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}} links = {} -for line in gfa_lines: - line_content = line[:-1].split("\t") - line_id = line_content[0] - - # Segment line - if line_id == "S" : +# Parsing the gfa +## Multithreading function +def parse_gfa_line(gfa_lines, aln_nodes=aln_nodes): + _nodes, _nodes_length, _links, _paths = {}, {}, {}, {} + for line in gfa_lines: + line_content = line[:-1].split("\t") + line_id = line_content[0] - nodes_length[str(line_content[1])] = len(line_content[2]) + # Segment line + if line_id == "S" : + + _nodes_length[str(line_content[1])] = len(line_content[2]) - if line_content[1] in aln_nodes: - nodes[str(line_content[1])] = {} - - # Link line - elif line_id == "L": - try : - links[str(line_content[1])][str(line_content[3])] = { - "FROM": str(line_content[2]), - "TO": str(line_content[4]) + if line_content[1] in aln_nodes: + _nodes[str(line_content[1])] = {"PATHS": {}} + + # 
Link line + elif line_id == "L": + try : + _links[str(line_content[1])][str(line_content[3])] = { + "FROM": str(line_content[2]), + "TO": str(line_content[4]) + } + + except : + _links[str(line_content[1])] = { + str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} + } + + # Path line + elif line_id == "P": + _paths[str(line_content[1])] = { + "NODES": { + str(node_id[:-1]): str(node_id[-1]) + for node_id in line_content[2].split(',') + }, + "CIGAR": line_content[3] } - except : - links[str(line_content[1])] = { - [str(line_content[3])] : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} - } + return [_nodes, _nodes_length, _links, _paths] + +## Parsing subsets +executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) +res = {} +quantiles = np.array(range(args.threads+1))/args.threads +splits = np.quantile(range(len(gfa_lines)), quantiles, method="higher").tolist() +splits[-1] += 1 +for i in range(len(splits)-1): + res[i] = executor.submit(parse_gfa_line, gfa_lines[splits[i]:splits[i+1]]) +executor.shutdown(wait=True) + +## Aggregating results +for _res in res.values(): + _nodes, _nodes_length, _links, _paths = _res.result() + + for key, value in _nodes.items(): + nodes[key] = value + + for key, value in _nodes_length.items(): + nodes_length[key] = value - # Path line - elif line_id == "P": - paths[str(line_content[1])] = { - "NODES": [ - {"NODE.ID": str(node_id[:-1]), "ORIENT": str(node_id[-1])} - for node_id in line_content[2].split(',') - ], - "CIGAR": line_content[3] - } + for key, value in _links.items(): + if not key in links.keys(): + links[key] = value + else : + for key_sub, value_sub in value.items(): + links[key][key_sub] = value_sub + + for key, value in _paths.items(): + paths[key] = value # Getting the start and end position of alignment nodes on each paths -def getPathPos(node_id, nodes=nodes, nodes_length=nodes_length, links=links, paths=paths) +def 
getPathPos(path_id, nodes=nodes, nodes_length=nodes_length, links=links, paths=paths): + cur_pos = 0 + _dict = nodes.copy() + for path_node in paths[path_id]["NODES"].keys(): + try : + _dict[path_node]["PATHS"][path_id] = (cur_pos, cur_pos+nodes_length[path_node]) + cur_pos += nodes_length[path_node] + except : + cur_pos += nodes_length[path_node] + return _dict + +# Collecting positions of nodes within paths +## Searching in each path +executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) +res = {} +for path_name in paths.keys(): + print(f"Running on {path_name}") + res[path_name] = executor.submit(getPathPos, path_name) +executor.shutdown(wait=True) + +## Storing results +for path in res.keys(): + _dict = res[path].result() + for node_id, path_dict in _dict.items(): + print(node_id, path_dict) + for path_name, coordinates in path_dict["PATHS"].items(): + nodes[node_id]["PATHS"][path_name] = coordinates + +print(nodes) + + -for node_id in nodes.keys(): - for path_id in paths.keys(): - if -- GitLab From 0f682ffdf8b00ee009ac89e8e2d8057d56eeb785 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 14 May 2024 14:57:45 +0200 Subject: [PATCH 04/30] Update gaf2aln.py --- gaf2aln.py | 166 +++++++++++++++++++++++------------------------------ 1 file changed, 72 insertions(+), 94 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index 823a51d..a684dc8 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -63,6 +63,7 @@ if args.version: os._exit(0) # Parsing the .gaf file +print(f"[gaf2aln::GAF Parser] Reading {args.gaf} ...") with open(args.gaf, 'r') as file: gaf_lines = file.readlines() @@ -73,6 +74,7 @@ gaf_col = [ ] # Creating dictionnary to store alignments +print(f"[gaf2aln::GAF Parser] Extracting alignments ...") aln_dict = {} for line in range(len(gaf_lines)): ## Splitting the line by tabulation @@ -85,7 +87,7 @@ for line in range(len(gaf_lines)): ## Splitting "PATH.MATCH" into a list aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = [ 
- int(node_id) for node_id in aln_dict[f"ALN_{line+1}"]["PATH.MATCH"].split(">")[1:] + str(node_id) for node_id in aln_dict[f"ALN_{line+1}"]["PATH.MATCH"].split(">")[1:] ] ## Adding tags @@ -93,16 +95,15 @@ for line in range(len(gaf_lines)): # Getting nodes of interest ids aln_nodes = np.unique([ - node_id + str(node_id) for aln in aln_dict.keys() for node_id in aln_dict[aln]["PATH.MATCH"] -]) -print(aln_nodes) +]).tolist() -# Debug -print(aln_dict) +del gaf_lines, gaf_col # Parsing the .gfa +print(f"[gaf2aln::GFA Parser] Reading {args.gfa} ...") with open(args.gfa, 'r') as file: gfa_lines = file.readlines() @@ -111,7 +112,9 @@ with open(args.gfa, 'r') as file: nodes_length = {} # Nodes dictionnary structured as follow : # {<ALN.NODE.ID> : {PATHS: {<PATH.NAME>: (start, end)}}} -nodes = {} +nodes = { + node_id: {"PATHS": {}} for node_id in aln_nodes +} # Paths dictionnary structured as follow : # {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} paths = {} @@ -120,106 +123,81 @@ paths = {} links = {} # Parsing the gfa -## Multithreading function -def parse_gfa_line(gfa_lines, aln_nodes=aln_nodes): - _nodes, _nodes_length, _links, _paths = {}, {}, {}, {} - for line in gfa_lines: - line_content = line[:-1].split("\t") - line_id = line_content[0] - - # Segment line - if line_id == "S" : - - _nodes_length[str(line_content[1])] = len(line_content[2]) - - if line_content[1] in aln_nodes: - _nodes[str(line_content[1])] = {"PATHS": {}} +print(f"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...") +for line in gfa_lines: + line_content = line[:-1].split("\t") + line_id = line_content[0] + + # Segment line + if line_id == "S" : - # Link line - elif line_id == "L": - try : - _links[str(line_content[1])][str(line_content[3])] = { - "FROM": str(line_content[2]), - "TO": str(line_content[4]) - } - - except : - _links[str(line_content[1])] = { - str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": 
str(line_content[4])} - } - - # Path line - elif line_id == "P": - _paths[str(line_content[1])] = { - "NODES": { - str(node_id[:-1]): str(node_id[-1]) - for node_id in line_content[2].split(',') - }, - "CIGAR": line_content[3] + nodes_length[str(line_content[1])] = len(line_content[2]) + + # Link line + elif line_id == "L": + try : + links[str(line_content[1])][str(line_content[3])] = { + "FROM": str(line_content[2]), + "TO": str(line_content[4]) } - return [_nodes, _nodes_length, _links, _paths] - -## Parsing subsets -executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) -res = {} -quantiles = np.array(range(args.threads+1))/args.threads -splits = np.quantile(range(len(gfa_lines)), quantiles, method="higher").tolist() -splits[-1] += 1 -for i in range(len(splits)-1): - res[i] = executor.submit(parse_gfa_line, gfa_lines[splits[i]:splits[i+1]]) -executor.shutdown(wait=True) - -## Aggregating results -for _res in res.values(): - _nodes, _nodes_length, _links, _paths = _res.result() - - for key, value in _nodes.items(): - nodes[key] = value - - for key, value in _nodes_length.items(): - nodes_length[key] = value + except : + links[str(line_content[1])] = { + str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} + } - for key, value in _links.items(): - if not key in links.keys(): - links[key] = value - else : - for key_sub, value_sub in value.items(): - links[key][key_sub] = value_sub + # Path line + elif line_id == "P": + paths[str(line_content[1])] = { + "NODES": { + str(node_id[:-1]): str(node_id[-1]) + for node_id in line_content[2].split(',') + }, + "CIGAR": line_content[3] + } - for key, value in _paths.items(): - paths[key] = value +del gfa_lines # Getting the start and end position of alignment nodes on each paths -def getPathPos(path_id, nodes=nodes, nodes_length=nodes_length, links=links, paths=paths): +print(f"[gaf2aln::Graph processing] Computing nodes positions ...") +for path_name in 
paths.keys(): + print(f"[gaf2aln::Graph processing] Running on {path_name} ...") cur_pos = 0 - _dict = nodes.copy() - for path_node in paths[path_id]["NODES"].keys(): + for path_node in paths[path_name]["NODES"].keys(): try : - _dict[path_node]["PATHS"][path_id] = (cur_pos, cur_pos+nodes_length[path_node]) + nodes[path_node]["PATHS"][path_name] = (cur_pos, cur_pos+nodes_length[path_node]) cur_pos += nodes_length[path_node] except : cur_pos += nodes_length[path_node] - return _dict -# Collecting positions of nodes within paths -## Searching in each path -executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) -res = {} -for path_name in paths.keys(): - print(f"Running on {path_name}") - res[path_name] = executor.submit(getPathPos, path_name) -executor.shutdown(wait=True) - -## Storing results -for path in res.keys(): - _dict = res[path].result() - for node_id, path_dict in _dict.items(): - print(node_id, path_dict) - for path_name, coordinates in path_dict["PATHS"].items(): - nodes[node_id]["PATHS"][path_name] = coordinates - -print(nodes) +# Reconstructing alignments for each path +print(f"[gaf2aln::Alignment processing] Computing alignments ...") +for aln_name in aln_dict.keys(): + + print(f"[gaf2aln::Alignment processing] Looking into alignment {aln_name} ...") + for path_name in paths.keys(): + + print(f"[gaf2aln::Alignment processing] Running on {path_name} ...") + cur_pos, cur_aln = 0, [] + + # Traversing alignment path + for node_id in aln_dict[aln_name]["PATH.MATCH"]: + + # Checking if node is traversed by the current path + if path_name in nodes[node_id]["PATHS"].keys(): + try : + cur_aln[-1] += [node_id] + except : + cur_aln.append([node_id]) + + else : + # Checking for emptyness + if not len(cur_aln) or not len(cur_aln[-1]): + cur_aln.append([]) + else : + + + # Ajouter le noeud au segment contigue ou finir le dernier segment le cas échéant -- GitLab From 15e32c6bda5f4b8a99a24c778da233c3ba98e12c Mon Sep 17 00:00:00 2001 From: Alexis 
Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Tue, 14 May 2024 16:30:50 +0200 Subject: [PATCH 05/30] Update gaf2aln.py --- gaf2aln.py | 74 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index a684dc8..cd73cd4 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -62,6 +62,15 @@ if args.version: print(version) os._exit(0) +# Toolbox +def walk2path(walk): + """ + Takes a walk in a single string and returns a list of nodes id with signs (gfa v1 like) + """ + _ = re.findall(r'>\w+|<\w+', walk) + # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+' + return [f'{elem[1:]}{(elem[0] == ">")*"+"+(elem[0] == "<")*"-"}' for elem in _] + # Parsing the .gaf file print(f"[gaf2aln::GAF Parser] Reading {args.gaf} ...") with open(args.gaf, 'r') as file: @@ -84,11 +93,12 @@ for line in range(len(gaf_lines)): aln_dict[f"ALN_{line+1}"] = { gaf_col[i]: line_content[i] for i in range(len(gaf_col)) } - + ## Splitting "PATH.MATCH" into a list - aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = [ - str(node_id) for node_id in aln_dict[f"ALN_{line+1}"]["PATH.MATCH"].split(">")[1:] - ] + aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = { + str(node_id[:-1]): node_id[-1] + for node_id in walk2path(aln_dict[f"ALN_{line+1}"]["PATH.MATCH"]) + } ## Adding tags aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:]) @@ -97,7 +107,7 @@ for line in range(len(gaf_lines)): aln_nodes = np.unique([ str(node_id) for aln in aln_dict.keys() - for node_id in aln_dict[aln]["PATH.MATCH"] + for node_id in aln_dict[aln]["PATH.MATCH"].keys() ]).tolist() del gaf_lines, gaf_col @@ -170,34 +180,62 @@ for path_name in paths.keys(): except : cur_pos += nodes_length[path_node] +final_aln = [] + # Reconstructing alignments for each path print(f"[gaf2aln::Alignment processing] Computing alignments ...") for aln_name in aln_dict.keys(): print(f"[gaf2aln::Alignment processing] Looking into alignment {aln_name} ...") + 
aln_dict[aln_name]["HAP.MATCH"] = {} + for path_name in paths.keys(): print(f"[gaf2aln::Alignment processing] Running on {path_name} ...") - cur_pos, cur_aln = 0, [] - + cur_aln = [] + in_aln = False # Keeping track of if we are in an alignment + # Traversing alignment path - for node_id in aln_dict[aln_name]["PATH.MATCH"]: + for node_id, strand_on_aln in aln_dict[aln_name]["PATH.MATCH"].items(): + strand_on_path = paths[path_name]["NODES"][node_id] + + # Checking strand + if strand_on_aln == strand_on_path : + strand = "S" + else : + strand = "I" # Checking if node is traversed by the current path if path_name in nodes[node_id]["PATHS"].keys(): - try : - cur_aln[-1] += [node_id] - except : - cur_aln.append([node_id]) + if not in_aln: + cur_aln.append({ + "IN": True, + "NODES": { + node_id: strand + } + }) + in_aln = True + + else : cur_aln[-1]["NODES"][node_id] = strand else : - # Checking for emptyness - if not len(cur_aln) or not len(cur_aln[-1]): - cur_aln.append([]) - else : - + if in_aln or not len(cur_aln): + cur_aln.append({ + "IN": False, + "NODES": { + node_id: strand + } + }) + in_aln = False + + else : cur_aln[-1]["NODES"][node_id] = strand + + aln_dict[aln_name]["HAP.MATCH"][path_name] = cur_aln.copy() + print(f"{path_name} :", cur_aln) + + # Traversing the HAP.MATCH to get alignment + for - # Ajouter le noeud au segment contigue ou finir le dernier segment le cas échéant -- GitLab From 146d897c7818d60499b5611294ecdc0d2bcb80e5 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 17 May 2024 10:31:52 +0200 Subject: [PATCH 06/30] Update gaf2aln.py --- gaf2aln.py | 175 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 54 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index cd73cd4..afc61e2 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -13,6 +13,7 @@ import pandas as pd import argparse import concurrent.futures import os +import re version = "0.1" @@ -71,6 +72,35 @@ def 
walk2path(walk): # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+' return [f'{elem[1:]}{(elem[0] == ">")*"+"+(elem[0] == "<")*"-"}' for elem in _] +def cigar2basealn(cigar): + """ + Takes a CIGAR string and convert it into a list of base level alignment. + For example : "345=" -> ["=", "=", ..., "="] of length 345. + """ + _ = re.findall(r'\d+\D', cigar) + print(_) + final_cigar = [] + for match in _: + final_cigar += [match[-1]]*int(match[:-1]) + + print(final_cigar) + return final_cigar + +def basealn2cigar(base_aln_list): + + last_elem = base_aln_list[0] + CIGAR = [[1, last_elem]] + for elem in base_aln_list[1:]: + if elem == last_elem: + CIGAR[-1][0] += 1 + + else : + CIGAR[-1][0] = str(CIGAR[-1][0]) + CIGAR.append([1, elem]) + last_elem = elem + CIGAR[-1][0] = str(CIGAR[-1][0]) + return "".join(["".join(block) for block in CIGAR if block[1] != ""]) + # Parsing the .gaf file print(f"[gaf2aln::GAF Parser] Reading {args.gaf} ...") with open(args.gaf, 'r') as file: @@ -100,8 +130,11 @@ for line in range(len(gaf_lines)): for node_id in walk2path(aln_dict[f"ALN_{line+1}"]["PATH.MATCH"]) } + ## Adding CIGAR + aln_dict[f"ALN_{line+1}"]["RAW.CIGAR"] = line_content[-1] + ## Adding tags - aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:]) + aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:-1]) # Getting nodes of interest ids aln_nodes = np.unique([ @@ -168,74 +201,108 @@ for line in gfa_lines: del gfa_lines -# Getting the start and end position of alignment nodes on each paths -print(f"[gaf2aln::Graph processing] Computing nodes positions ...") +# Creating GA (Graph alignment) dictionnary storing given info : +# {<ALN_ID> : +# { <PATH.ID> : [ +# { NODE.ID, P.ORIENT, A.ORIENT, P.POS, A.POS, CG } +# ] +# } +# } + +print(f"[gaf2aln::Graph alignment processing] Computing nodes positions in each paths...") +# Adding nodes positions relative to path for path_name in paths.keys(): print(f"[gaf2aln::Graph processing] Running on {path_name} 
...") cur_pos = 0 for path_node in paths[path_name]["NODES"].keys(): try : nodes[path_node]["PATHS"][path_name] = (cur_pos, cur_pos+nodes_length[path_node]) - cur_pos += nodes_length[path_node] + cur_pos += nodes_length[path_node]+1 except : - cur_pos += nodes_length[path_node] - -final_aln = [] - -# Reconstructing alignments for each path -print(f"[gaf2aln::Alignment processing] Computing alignments ...") -for aln_name in aln_dict.keys(): - - print(f"[gaf2aln::Alignment processing] Looking into alignment {aln_name} ...") - aln_dict[aln_name]["HAP.MATCH"] = {} + cur_pos += nodes_length[path_node]+1 - for path_name in paths.keys(): - - print(f"[gaf2aln::Alignment processing] Running on {path_name} ...") - cur_aln = [] - in_aln = False # Keeping track of if we are in an alignment +# Calculating CIGAR for each nodes in each aln +print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...") +# Iterating over alignments +for aln in aln_dict.keys(): + + print(f"[gaf2aln::CIGAR processing] Running on {aln} ...") + # Getting the list of base level alignement (["=", "X", ...] 
from "1=1X...") + raw_cigar = cigar2basealn(aln_dict[aln]["RAW.CIGAR"]) + + cur_pos = 0 + print(len(raw_cigar)) + CIGAR={} + aln_nodes_id = list(aln_dict[aln]["PATH.MATCH"].keys()) + for node_id in aln_nodes_id: + + # Starting aln node + if node_id == aln_nodes_id[0] : + _cigar = basealn2cigar(raw_cigar[ + 0:nodes_length[node_id]-int(aln_dict[aln]["ALN.START"]) + ]) + cur_pos += nodes_length[node_id]-int(aln_dict[aln]["ALN.START"]) + # Last aln node + elif node_id == aln_nodes_id[-1]: + _cigar = basealn2cigar(raw_cigar[cur_pos:]) + else : + _cigar = basealn2cigar(raw_cigar[cur_pos:cur_pos+nodes_length[node_id]]) + cur_pos += nodes_length[node_id] + + CIGAR[node_id] = _cigar - # Traversing alignment path - for node_id, strand_on_aln in aln_dict[aln_name]["PATH.MATCH"].items(): - strand_on_path = paths[path_name]["NODES"][node_id] + aln_dict[aln]["CIGAR"] = CIGAR + print(CIGAR) - # Checking strand - if strand_on_aln == strand_on_path : - strand = "S" - else : - strand = "I" - - # Checking if node is traversed by the current path - if path_name in nodes[node_id]["PATHS"].keys(): - if not in_aln: - cur_aln.append({ - "IN": True, - "NODES": { - node_id: strand - } - }) - in_aln = True - - else : cur_aln[-1]["NODES"][node_id] = strand +GA = {} +# Computing alignments nodes positions in paths +print(f"[gaf2aln::Graph alignment processing] Lifting alignments coordinates paths positions...") +for aln_name in aln_dict.keys(): + GA[aln_name] = {} + aln_pos = 0 + _ = list(aln_dict[aln]["PATH.MATCH"].keys()) + start_end_ids = _[0], _[-1] + + for node_id, orient in aln_dict[aln_name]["PATH.MATCH"].items(): + for path_name in nodes[node_id]["PATHS"].keys(): + if node_id == start_end_ids[0] : + _apos = (aln_dict[aln_name]["QRY.START"], nodes_length[node_id]) + _ppos = ( + nodes[node_id]["PATHS"][path_name][0]+int(aln_dict[aln_name]["ALN.START"]), + nodes[node_id]["PATHS"][path_name][1] + ) + elif node_id == start_end_ids[-1]: + _ppos = ( + nodes[node_id]["PATHS"][path_name][0], + 
nodes[node_id]["PATHS"][path_name][0]+(int(aln_dict[aln_name]["ALN.BLOCK.LEN"]) - aln_pos) + ) + _apos = (aln_pos, aln_dict[aln_name]["END"]) else : - if in_aln or not len(cur_aln): - cur_aln.append({ - "IN": False, - "NODES": { - node_id: strand - } - }) - in_aln = False - - else : cur_aln[-1]["NODES"][node_id] = strand + _ppos = nodes[node_id]["PATHS"][path_name] + #_apos = (aln_pos, aln_pos+nodes_length[node_id]) - aln_dict[aln_name]["HAP.MATCH"][path_name] = cur_aln.copy() - print(f"{path_name} :", cur_aln) + + _dict = { + "NODE.ID": node_id, + "P.ORIENT": paths[path_name]["NODES"][node_id], + "A.ORIENT": aln_dict[aln_name]["PATH.MATCH"][node_id], + "P.POS": _ppos, + #"A.POS": _apos, + "CIGAR": aln_dict[aln_name]["CIGAR"][node_id] + } - # Traversing the HAP.MATCH to get alignment - for + try : + GA[aln_name][path_name].append(_dict) + except : + GA[aln_name][path_name] = [_dict] + +print(GA) + +# Creating the final dictionnary called LA (Linear alignment): +# Here we merged previous results to get full alignments +for -- GitLab From 9c80010ecf0c5d79f0060653e9aaf5acd5f92319 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 17 May 2024 13:46:21 +0200 Subject: [PATCH 07/30] Update gaf2aln.py --- gaf2aln.py | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index afc61e2..e8f8406 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -156,7 +156,7 @@ nodes_length = {} # Nodes dictionnary structured as follow : # {<ALN.NODE.ID> : {PATHS: {<PATH.NAME>: (start, end)}}} nodes = { - node_id: {"PATHS": {}} for node_id in aln_nodes + node_id: {"PATHS": {}, "ALN": {}} for node_id in aln_nodes } # Paths dictionnary structured as follow : # {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} @@ -221,6 +221,29 @@ for path_name in paths.keys(): except : cur_pos += nodes_length[path_node]+1 +print(f"[gaf2aln::Graph alignment 
processing] Computing nodes positions in each alignement...") +# Adding nodes positions relative to path +for aln_name in aln_dict.keys(): + print(f"[gaf2aln::Graph processing] Running on {aln_name} ...") + cur_pos = 0 + _ = list(aln_dict[aln_name]["PATH.MATCH"].keys()) + start_end_id = (_[0], _[-1]) + for node_id in aln_dict[aln_name]["PATH.MATCH"].keys(): + if node_id == start_end_id[0]: + start_pos = int(aln_dict[aln_name]["ALN.START"]) + end_pos = nodes_length[node_id]-int(aln_dict[aln_name]["ALN.START"]) + elif node_id == start_end_id[1]: + start_pos = cur_pos + end_pos = int(aln_dict[aln_name]["ALN.END"]) + else : + start_pos = cur_pos + end_pos = cur_pos+nodes_length[node_id] + + nodes[node_id]["ALN"][aln_name] = (start_pos, end_pos) + cur_pos = end_pos+1 + +print(nodes) + # Calculating CIGAR for each nodes in each aln print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...") # Iterating over alignments @@ -260,14 +283,12 @@ GA = {} print(f"[gaf2aln::Graph alignment processing] Lifting alignments coordinates paths positions...") for aln_name in aln_dict.keys(): GA[aln_name] = {} - aln_pos = 0 _ = list(aln_dict[aln]["PATH.MATCH"].keys()) start_end_ids = _[0], _[-1] for node_id, orient in aln_dict[aln_name]["PATH.MATCH"].items(): for path_name in nodes[node_id]["PATHS"].keys(): if node_id == start_end_ids[0] : - _apos = (aln_dict[aln_name]["QRY.START"], nodes_length[node_id]) _ppos = ( nodes[node_id]["PATHS"][path_name][0]+int(aln_dict[aln_name]["ALN.START"]), nodes[node_id]["PATHS"][path_name][1] @@ -275,12 +296,10 @@ for aln_name in aln_dict.keys(): elif node_id == start_end_ids[-1]: _ppos = ( nodes[node_id]["PATHS"][path_name][0], - nodes[node_id]["PATHS"][path_name][0]+(int(aln_dict[aln_name]["ALN.BLOCK.LEN"]) - aln_pos) + nodes[node_id]["PATHS"][path_name][0]+(int(aln_dict[aln_name]["ALN.END"]) - nodes[node_id]["ALN"][aln_name][0]) ) - _apos = (aln_pos, aln_dict[aln_name]["END"]) else : _ppos = nodes[node_id]["PATHS"][path_name] - 
#_apos = (aln_pos, aln_pos+nodes_length[node_id]) _dict = { @@ -288,7 +307,6 @@ for aln_name in aln_dict.keys(): "P.ORIENT": paths[path_name]["NODES"][node_id], "A.ORIENT": aln_dict[aln_name]["PATH.MATCH"][node_id], "P.POS": _ppos, - #"A.POS": _apos, "CIGAR": aln_dict[aln_name]["CIGAR"][node_id] } @@ -302,8 +320,22 @@ print(GA) # Creating the final dictionnary called LA (Linear alignment): # Here we merged previous results to get full alignments -for +for aln_name in GA.keys(): + for path_name, node_list in GA[aln_name].items(): + + contiguity = False + orient = 1 + alns = [] + + for node_data in node_list: + + cur_orient = (node_data["P.ORIENT"] == node_data["A.ORIENT"]) + + if not contiguity : + alns.append( + {"Q.START": nodes[node_data["NODE.ID"]]["ALN"][aln_name]} + ) -- GitLab From 662abb3a4d5c573fc1f965bce54729876d9e858b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 17 May 2024 18:58:10 +0200 Subject: [PATCH 08/30] Update gaf2aln.py --- gaf2aln.py | 224 +++++++++++++++++++++++++++++------------------------ 1 file changed, 122 insertions(+), 102 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index e8f8406..e9850b3 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -78,12 +78,10 @@ def cigar2basealn(cigar): For example : "345=" -> ["=", "=", ..., "="] of length 345. 
""" _ = re.findall(r'\d+\D', cigar) - print(_) final_cigar = [] for match in _: final_cigar += [match[-1]]*int(match[:-1]) - print(final_cigar) return final_cigar def basealn2cigar(base_aln_list): @@ -125,10 +123,10 @@ for line in range(len(gaf_lines)): } ## Splitting "PATH.MATCH" into a list - aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = { - str(node_id[:-1]): node_id[-1] + aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = [ + (str(node_id[:-1]), node_id[-1]) for node_id in walk2path(aln_dict[f"ALN_{line+1}"]["PATH.MATCH"]) - } + ] ## Adding CIGAR aln_dict[f"ALN_{line+1}"]["RAW.CIGAR"] = line_content[-1] @@ -140,7 +138,7 @@ for line in range(len(gaf_lines)): aln_nodes = np.unique([ str(node_id) for aln in aln_dict.keys() - for node_id in aln_dict[aln]["PATH.MATCH"].keys() + for node_id, orient in aln_dict[aln]["PATH.MATCH"] ]).tolist() del gaf_lines, gaf_col @@ -154,10 +152,12 @@ with open(args.gfa, 'r') as file: # {<NODE.ID>: <NODE.LENGTH>} nodes_length = {} # Nodes dictionnary structured as follow : -# {<ALN.NODE.ID> : {PATHS: {<PATH.NAME>: (start, end)}}} -nodes = { - node_id: {"PATHS": {}, "ALN": {}} for node_id in aln_nodes -} +# { <ALN.NODE.ID> : { +# <PATH.NAME>: {"START": start, "END": end, "STRAND": strand), +# <ALN.NAME>: {"START": start, "END": end, "S.OFF": start.offset, "E.OFF": end.offset, "STRAND": strand, "CIGAR": CIGAR} +# } +# } +nodes = {node_id: {} for node_id in aln_nodes} # Paths dictionnary structured as follow : # {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} paths = {} @@ -201,48 +201,84 @@ for line in gfa_lines: del gfa_lines -# Creating GA (Graph alignment) dictionnary storing given info : -# {<ALN_ID> : -# { <PATH.ID> : [ -# { NODE.ID, P.ORIENT, A.ORIENT, P.POS, A.POS, CG } -# ] -# } -# } - print(f"[gaf2aln::Graph alignment processing] Computing nodes positions in each paths...") # Adding nodes positions relative to path for path_name in paths.keys(): print(f"[gaf2aln::Graph processing] Running on 
{path_name} ...") cur_pos = 0 + + # Iterating over nodes in the path for path_node in paths[path_name]["NODES"].keys(): + # Instead of checking if the node is one interesting node, we try to add to the nodes dict try : - nodes[path_node]["PATHS"][path_name] = (cur_pos, cur_pos+nodes_length[path_node]) + nodes[path_node][path_name] = { + "START": cur_pos, # Start position of the node start in the currrent path + "END": cur_pos+nodes_length[path_node], # End position of the node end in the current path + "STRAND": paths[path_name]["NODES"][node_id] # Orientation of the node in the current path + } + cur_pos += nodes_length[path_node]+1 except : cur_pos += nodes_length[path_node]+1 print(f"[gaf2aln::Graph alignment processing] Computing nodes positions in each alignement...") # Adding nodes positions relative to path -for aln_name in aln_dict.keys(): - print(f"[gaf2aln::Graph processing] Running on {aln_name} ...") + +def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length): + # Initializing current position in query cur_pos = 0 - _ = list(aln_dict[aln_name]["PATH.MATCH"].keys()) - start_end_id = (_[0], _[-1]) - for node_id in aln_dict[aln_name]["PATH.MATCH"].keys(): + + # Getting start and end node ids + start_end_id = (aln_dict[aln_name]["PATH.MATCH"][0][0], aln_dict[aln_name]["PATH.MATCH"][-1][0]) + + # Creating result dictionnary + res = {} + + ## Iterating over node_ids from the given alignment + for node_id, orient in aln_dict[aln_name]["PATH.MATCH"]: + # Adding entry for current node + res[node_id] = {aln_name: {}} + + # First node if node_id == start_end_id[0]: - start_pos = int(aln_dict[aln_name]["ALN.START"]) - end_pos = nodes_length[node_id]-int(aln_dict[aln_name]["ALN.START"]) + start_pos = 0 + s_off = int(aln_dict[aln_name]["ALN.START"]) + end_pos = nodes_length[node_id]-s_off + e_off = 0 + # End node elif node_id == start_end_id[1]: start_pos = cur_pos - end_pos = int(aln_dict[aln_name]["ALN.END"]) + s_off = 0 + end_pos = 
int(aln_dict[aln_name]["QRY.END"]) + e_off = nodes_length[node_id]-(end_pos-cur_pos) + # Node in between else : start_pos = cur_pos + s_off, e_off = 0, 0 end_pos = cur_pos+nodes_length[node_id] - nodes[node_id]["ALN"][aln_name] = (start_pos, end_pos) - cur_pos = end_pos+1 + res[node_id] = { + "START": start_pos, # Start position on the query + "END": end_pos, # End position on the query + "S.OFF": s_off, # Offset between the start of the alignment and the node's start + "E.OFF": e_off, # Offset between the end of the alignment and the node's end + "STRAND": orient # Orientation of the node in the alignment + } + + cur_pos = end_pos + print(start_pos, end_pos, s_off, e_off, orient, nodes_length[node_id], cur_pos) -print(nodes) + return res + +# Storing alignement +aln_processing = {} +for aln_name in aln_dict.keys(): + print(f"[gaf2aln::Graph processing] Running on {aln_name} ...") + + _ = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length) + + for node_id, res in _.items(): + nodes[node_id][aln_name] = res # Calculating CIGAR for each nodes in each aln print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...") @@ -252,90 +288,74 @@ for aln in aln_dict.keys(): print(f"[gaf2aln::CIGAR processing] Running on {aln} ...") # Getting the list of base level alignement (["=", "X", ...] 
from "1=1X...") raw_cigar = cigar2basealn(aln_dict[aln]["RAW.CIGAR"]) - - cur_pos = 0 - print(len(raw_cigar)) CIGAR={} - aln_nodes_id = list(aln_dict[aln]["PATH.MATCH"].keys()) - for node_id in aln_nodes_id: - - # Starting aln node - if node_id == aln_nodes_id[0] : - _cigar = basealn2cigar(raw_cigar[ - 0:nodes_length[node_id]-int(aln_dict[aln]["ALN.START"]) - ]) - cur_pos += nodes_length[node_id]-int(aln_dict[aln]["ALN.START"]) - # Last aln node - elif node_id == aln_nodes_id[-1]: - _cigar = basealn2cigar(raw_cigar[cur_pos:]) - else : - _cigar = basealn2cigar(raw_cigar[cur_pos:cur_pos+nodes_length[node_id]]) - cur_pos += nodes_length[node_id] - CIGAR[node_id] = _cigar - - aln_dict[aln]["CIGAR"] = CIGAR - print(CIGAR) + for node_id, orient in aln_dict[aln]["PATH.MATCH"]: + + _cigar = basealn2cigar(raw_cigar[ + nodes[node_id][aln]["START"]:nodes[node_id][aln]["END"] + ]) + nodes[node_id][aln]["CIGAR"] = _cigar + #print(_cigar, nodes[node_id][aln]["START"], nodes[node_id][aln]["END"]) + +#print(nodes) -GA = {} +# Lifting graph alignements to haplotype alignements -# Computing alignments nodes positions in paths -print(f"[gaf2aln::Graph alignment processing] Lifting alignments coordinates paths positions...") +ALNS = {} for aln_name in aln_dict.keys(): - GA[aln_name] = {} - _ = list(aln_dict[aln]["PATH.MATCH"].keys()) - start_end_ids = _[0], _[-1] - - for node_id, orient in aln_dict[aln_name]["PATH.MATCH"].items(): - for path_name in nodes[node_id]["PATHS"].keys(): - if node_id == start_end_ids[0] : - _ppos = ( - nodes[node_id]["PATHS"][path_name][0]+int(aln_dict[aln_name]["ALN.START"]), - nodes[node_id]["PATHS"][path_name][1] - ) - elif node_id == start_end_ids[-1]: - _ppos = ( - nodes[node_id]["PATHS"][path_name][0], - nodes[node_id]["PATHS"][path_name][0]+(int(aln_dict[aln_name]["ALN.END"]) - nodes[node_id]["ALN"][aln_name][0]) - ) - else : - _ppos = nodes[node_id]["PATHS"][path_name] - - - _dict = { - "NODE.ID": node_id, - "P.ORIENT": 
paths[path_name]["NODES"][node_id], - "A.ORIENT": aln_dict[aln_name]["PATH.MATCH"][node_id], - "P.POS": _ppos, - "CIGAR": aln_dict[aln_name]["CIGAR"][node_id] - } + + for path_name in paths.keys(): + ALNS[(path_name, aln_name)] = [] + _ = [] + for node_id, orient in aln_dict[aln_name]["PATH.MATCH"].items(): + + n_info = nodes[node_id] try : - GA[aln_name][path_name].append(_dict) - except : - GA[aln_name][path_name] = [_dict] - -print(GA) - -# Creating the final dictionnary called LA (Linear alignment): -# Here we merged previous results to get full alignments + if n_info[aln_name]["STRAND"] == n_info[path_name]["STRAND"] : + t_start = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] + t_end = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] + else : + t_end = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] + t_start = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] + + q_start = n_info[aln_name]["START"] + q_end = n_info[aln_name]["END"] + _CG = n_info[aln_name]["CIGAR"] + + # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : + if len(_) and _[-1]["T.END"] == t_start and _[-1]["Q.END"] == q_start: + tmp_aln["Q.END"] = q_end + tmp_aln["T.END"] = t_end + tmp_aln["CG"] += _CG + elif len(_) and _[-1]["T.END"] == t_start: # Following on the target not on the query (i.e. Insertion) + tmp_aln["T.END"] = t_end + tmp_aln["CG"] += f"{nodes_length[node_id]}I" + elif len(_) and _[-1]["Q.END"] == q_start: # Following on the query, not on the target (i.e. 
Deletion) + tmp_aln["Q.END"] = q_end + tmps_aln["CG"] += f"{nodes_length[node_id]}D" + else : # Else, completely different + tmp_aln = { + "Q.START": q_start , + "Q.END": q_end, + "T.START": t_start, + "T.END": t_end, + "CG": _CG, + } + except: + # Node is not in the path + tmp_aln -for aln_name in GA.keys(): + - for path_name, node_list in GA[aln_name].items(): - contiguity = False - orient = 1 - alns = [] - - for node_data in node_list: - cur_orient = (node_data["P.ORIENT"] == node_data["A.ORIENT"]) + + + + - if not contiguity : - alns.append( - {"Q.START": nodes[node_data["NODE.ID"]]["ALN"][aln_name]} - ) -- GitLab From a12ec03c1c4e18dea839511ffcf2b6d893cead6b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 17 May 2024 20:38:15 +0200 Subject: [PATCH 09/30] Update gaf2aln.py --- gaf2aln.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index e9850b3..60794ea 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -309,9 +309,13 @@ for aln_name in aln_dict.keys(): ALNS[(path_name, aln_name)] = [] _ = [] - for node_id, orient in aln_dict[aln_name]["PATH.MATCH"].items(): + for node_id, orient in aln_dict[aln_name]["PATH.MATCH"]: n_info = nodes[node_id] + q_start = n_info[aln_name]["START"] + q_end = n_info[aln_name]["END"] + _CG = n_info[aln_name]["CIGAR"] + try : if n_info[aln_name]["STRAND"] == n_info[path_name]["STRAND"] : t_start = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] @@ -320,10 +324,6 @@ for aln_name in aln_dict.keys(): t_end = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] t_start = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] - q_start = n_info[aln_name]["START"] - q_end = n_info[aln_name]["END"] - _CG = n_info[aln_name]["CIGAR"] - # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : if len(_) and _[-1]["T.END"] == t_start and _[-1]["Q.END"] == q_start: tmp_aln["Q.END"] = q_end 
@@ -337,7 +337,7 @@ for aln_name in aln_dict.keys(): tmps_aln["CG"] += f"{nodes_length[node_id]}D" else : # Else, completely different tmp_aln = { - "Q.START": q_start , + "Q.START": q_start, "Q.END": q_end, "T.START": t_start, "T.END": t_end, @@ -345,7 +345,15 @@ for aln_name in aln_dict.keys(): } except: # Node is not in the path - tmp_aln + tmp_aln = { + "Q.START": q_start, + "Q.END": q_end, + "T.START": -1, + "T.END": -1, + "CG": f"{nodes_length[node_id]}D" + } + +print(ALNS) -- GitLab From a7751c03ea016645940f4c6653c471c7805ca342 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 21 May 2024 17:07:23 +0200 Subject: [PATCH 10/30] Update gaf2aln.py --- gaf2aln.py | 129 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 104 insertions(+), 25 deletions(-) diff --git a/gaf2aln.py b/gaf2aln.py index 60794ea..7657046 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -167,6 +167,55 @@ links = {} # Parsing the gfa print(f"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...") + +def GFA_parser(gfa_lines, nodes = nodes): + _links, _nodes, _nodes_length, paths = {}, {}, {}, {} + for line in gfa_lines: + line_content = line[:-1].split("\t") + line_id = line_content[0] + + # Segment line + if line_id == "S" : + + _nodes_length[str(line_content[1])] = len(line_content[2]) + + # Link line + elif line_id == "L": + try : + _links[str(line_content[1])][str(line_content[3])] = { + "FROM": str(line_content[2]), + "TO": str(line_content[4]) + } + + except : + _links[str(line_content[1])] = { + str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} + } + + # Path line + elif line_id == "P": + _paths[str(line_content[1])] = { + "NODES": { + str(node_id[:-1]): str(node_id[-1]) + for node_id in line_content[2].split(',') + }, + "CIGAR": line_content[3] + } + + return [_links, _nodes, _nodes_length, _paths] + +# splits = np.quantile(range(len(gfa_lines)+1), q= 
np.array(args.threads+1)/args.threads, method='higher').tolist() +# res = [] +# for i in range(1, len(splits)): +# res.append(executor.submit(GFA_parser, gfa_lines[splits[i-1]:splits[i]])) + +# for out in res: +# results = out.result() + +# for link_id, link_info in results[0].items(): +# links[] + + for line in gfa_lines: line_content = line[:-1].split("\t") line_id = line_content[0] @@ -201,27 +250,44 @@ for line in gfa_lines: del gfa_lines -print(f"[gaf2aln::Graph alignment processing] Computing nodes positions in each paths...") -# Adding nodes positions relative to path -for path_name in paths.keys(): - print(f"[gaf2aln::Graph processing] Running on {path_name} ...") +print(f"[gaf2aln::Graph position processing] Computing nodes positions in each paths...") +def get_node_pos(path_name, nodes = nodes, paths = paths, nodes_length = nodes_length): + print(f"[gaf2aln::Graph position processing] Running on {path_name} ...") cur_pos = 0 + out = {} # Iterating over nodes in the path for path_node in paths[path_name]["NODES"].keys(): # Instead of checking if the node is one interesting node, we try to add to the nodes dict - try : - nodes[path_node][path_name] = { + if path_node in aln_nodes : + out[path_node] = { "START": cur_pos, # Start position of the node start in the currrent path "END": cur_pos+nodes_length[path_node], # End position of the node end in the current path - "STRAND": paths[path_name]["NODES"][node_id] # Orientation of the node in the current path + "STRAND": paths[path_name]["NODES"][path_node] # Orientation of the node in the current path } cur_pos += nodes_length[path_node]+1 - except : + else : cur_pos += nodes_length[path_node]+1 -print(f"[gaf2aln::Graph alignment processing] Computing nodes positions in each alignement...") + return out + +res = {} +executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) +# Adding nodes positions relative to path +for path_name in paths.keys(): + res[path_name] = 
executor.submit(get_node_pos, path_name) + +executor.shutdown(wait=True) + +for path_name, out in res.items(): + results = out.result() + for path_node, node_pos in results.items(): + nodes[path_node][path_name] = node_pos + +del res + +print(f"[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...") # Adding nodes positions relative to path def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length): @@ -271,14 +337,22 @@ def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length return res # Storing alignement -aln_processing = {} +res = {} +executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) for aln_name in aln_dict.keys(): - print(f"[gaf2aln::Graph processing] Running on {aln_name} ...") + print(f"[gaf2aln::Alignment position processing] Running on {aln_name} ...") - _ = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length) + res[aln_name] = executor.submit(get_aln_node_info, aln_name) + #res[aln_name] = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length) + +executor.shutdown(wait=True) - for node_id, res in _.items(): - nodes[node_id][aln_name] = res +for aln_name, node_info in res.items(): + results = node_info.result() + for node_id, info in results.items(): + nodes[node_id][aln_name] = info + +del res # Calculating CIGAR for each nodes in each aln print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...") @@ -306,7 +380,6 @@ ALNS = {} for aln_name in aln_dict.keys(): for path_name in paths.keys(): - ALNS[(path_name, aln_name)] = [] _ = [] for node_id, orient in aln_dict[aln_name]["PATH.MATCH"]: @@ -316,7 +389,10 @@ for aln_name in aln_dict.keys(): q_end = n_info[aln_name]["END"] _CG = n_info[aln_name]["CIGAR"] - try : + print(node_id, path_name, q_start, q_end) + if path_name in list(n_info.keys()): + print("\tIn path") + if n_info[aln_name]["STRAND"] == n_info[path_name]["STRAND"] : 
t_start = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] t_end = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] @@ -324,6 +400,8 @@ for aln_name in aln_dict.keys(): t_end = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] t_start = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] + print("\t", t_start, t_end) + # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : if len(_) and _[-1]["T.END"] == t_start and _[-1]["Q.END"] == q_start: tmp_aln["Q.END"] = q_end @@ -343,17 +421,18 @@ for aln_name in aln_dict.keys(): "T.END": t_end, "CG": _CG, } - except: + print("\t", tmp_aln) + + else : + print("\tNot in path") # Node is not in the path - tmp_aln = { - "Q.START": q_start, - "Q.END": q_end, - "T.START": -1, - "T.END": -1, - "CG": f"{nodes_length[node_id]}D" - } -print(ALNS) + _.append(tmp_aln) + ALNS[(path_name, aln_name)] = _ + +## Debug +for elem in ALNS[("TO1000#1#chr03", "ALN_1")]: + print(elem) -- GitLab From 56990c58b0a19b66c42d98e3a5b62dd4a5c90d49 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 31 May 2024 13:42:58 +0200 Subject: [PATCH 11/30] Create Anchors2Path.py --- Anchors2Path.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 Anchors2Path.py diff --git a/Anchors2Path.py b/Anchors2Path.py new file mode 100644 index 0000000..e99b48a --- /dev/null +++ b/Anchors2Path.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Anchors2Path +Give anchors nodes ids with their relative positions in a given path. 
+ +@author: alexis.mergez@inrae.fr +@version: 0.1 +""" +import re +import argparse +import os +import numpy as np +import time +import pandas as pd +from functools import reduce +import concurrent.futures +import gzip + +version = "0.1" + +## Argument parser +arg_parser = argparse.ArgumentParser(description='Anchors2Path') +arg_parser.add_argument( + "--gfa", + "-g", + dest = "gfa", + required = True, + help = "GFA file" + ) +arg_parser.add_argument( + "--output", + "-o", + dest = "output", + required = True, + help = "Output name" + ) +arg_parser.add_argument( + "--threads", + "-t", + dest = "threads", + required = False, + default = 1, + type = int, + help = "Number of threads" + ) +arg_parser.add_argument( + '--version', + '-v', + action="store_true", + dest = "version", + help = "Show version" +) +arg_parser.add_argument( + '--progress', + '-P', + action="store_true", + dest = "progress", + help = "Show progress to stdout" +) +arg_parser.add_argument( + '--pathname', + '-r', + dest = "pathname", + required = True, + help = "Pathname" +) +args = arg_parser.parse_args() + +# Printing version and exiting if required +if args.version: + print(version) + os._exit(0) + +# Timing the script +start_time = time.time() + +## Reading the gfa into a list +# If not gzipped : +if args.gfa[-2:] != "gz" : + with open(args.gfa, 'r') as file: + gfaLines = file.readlines() + +# If gzipped : +else : + with gzip.open(args.gfa, 'r') as file: + gfaLines = [line.decode() for line in file.readlines()] + +# Progress message +if args.progress: print(f"[GFAstats::{panname}] Parsing gfa file...") + +# Initializing dictionnaries +Anchors = {} +## {<NODE_ID>: (path_start, path_end)} +nodes_length = {} +path_nodes = {} +## {<path_id>: <nodes_list>} + +for line in gfaLines[1:]: + + # Skipping comment lines + if line[0] == "#": + lineType = "#" + + # Reading 3 first columns of the current line + else : + lineType, uid, value = line[:-1].split('\t')[:3] + + if lineType == "S": # Segments = Nodes 
+ nodes_length[int(uid)] = len(value) + + elif lineType == "P": # Paths + + path_nodes[uid] = [k[:-1] for k in value.split(",")] + +if args.progress: + print(f"[GFAstats::{panname}] Parsed in {round(time.time() - start_time, 2)}s") + +# Getting the list of anchor nodes +node_path_count = {} +## {<NODE_ID>: <Number of path traversing this node>} +# Computing number of path traversing each nodes +for path_id, node_list in path_nodes.items(): + for node_id in node_list: + + try : + node_path_count[node_id] += 1 + except : + node_path_count[node_id] = 1 + +# Searching anchors +n_path = len(list(path_nodes.keys())) + +for node_id, count in node_path_count.items(): + if count == n_path : + Anchors[node_id] = [] + +# Computing path position for each node of the path of interest +current_pos = 0 +for node_id in path_nodes[args.pathname]: + _end = current_pos + nodes_length[node_id] + + # Trying to add anchors path position if it is an anchor + try : + Anchors[node_id].append( (current_pos, _end) ) + except: + pass + + current_pos = _end + +# Transforming data into a table +ID, START, END = [], [], [] +for node_id, positions in Anchors.items(): + for start, end in positions: + ID.append(node_id) + START.append(start) + END.append(end) + +df = pd.DataFrame(data = { + "NODE_ID": ID, + "START": START, + "END": END +}) + +df.to_csv(args.output, sep="\t") + -- GitLab From 1296dd803f594d2aeea67cb8cc92d9c007492922 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 31 May 2024 13:49:06 +0200 Subject: [PATCH 12/30] Update Anchors2Path.py --- Anchors2Path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index e99b48a..3006ef3 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -87,7 +87,7 @@ else : gfaLines = [line.decode() for line in file.readlines()] # Progress message -if args.progress: print(f"[GFAstats::{panname}] Parsing gfa file...") +if args.progress: print(f"[Anchors2Path] Parsing gfa 
file...") # Initializing dictionnaries Anchors = {} @@ -114,7 +114,7 @@ for line in gfaLines[1:]: path_nodes[uid] = [k[:-1] for k in value.split(",")] if args.progress: - print(f"[GFAstats::{panname}] Parsed in {round(time.time() - start_time, 2)}s") + print(f"[Anchors2Path] Parsed in {round(time.time() - start_time, 2)}s") # Getting the list of anchor nodes node_path_count = {} -- GitLab From c7d03713a9893274b3f3c106176bfc8bee9b2637 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 31 May 2024 13:52:15 +0200 Subject: [PATCH 13/30] Update Anchors2Path.py --- Anchors2Path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index 3006ef3..2e64c83 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -111,7 +111,7 @@ for line in gfaLines[1:]: elif lineType == "P": # Paths - path_nodes[uid] = [k[:-1] for k in value.split(",")] + path_nodes[uid] = [int(k[:-1]) for k in value.split(",")] if args.progress: print(f"[Anchors2Path] Parsed in {round(time.time() - start_time, 2)}s") @@ -142,7 +142,7 @@ for node_id in path_nodes[args.pathname]: # Trying to add anchors path position if it is an anchor try : - Anchors[node_id].append( (current_pos, _end) ) + Anchors[int(node_id)].append( (current_pos, _end) ) except: pass -- GitLab From 537598f921ec0f2a987f9554109802399fafccdb Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 31 May 2024 13:56:17 +0200 Subject: [PATCH 14/30] Update Anchors2Path.py --- Anchors2Path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index 2e64c83..3111c6f 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -162,5 +162,5 @@ df = pd.DataFrame(data = { "END": END }) -df.to_csv(args.output, sep="\t") +df.to_csv(args.output, sep="\t", index = False) -- GitLab From 745eee2cd00e33eb46ce7410baab3f15fdb05ac1 Mon Sep 17 00:00:00 2001 From: Alexis Mergez 
<amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 13:20:47 +0200 Subject: [PATCH 15/30] Update --- .ipynb_checkpoints/gaf2aln-checkpoint.ipynb | 1723 +++++++++++++ gaf2aln.ipynb | 2443 +++++++++++++++++++ gaf2aln.py | 38 +- 3 files changed, 4180 insertions(+), 24 deletions(-) create mode 100644 .ipynb_checkpoints/gaf2aln-checkpoint.ipynb create mode 100644 gaf2aln.ipynb diff --git a/.ipynb_checkpoints/gaf2aln-checkpoint.ipynb b/.ipynb_checkpoints/gaf2aln-checkpoint.ipynb new file mode 100644 index 0000000..effb776 --- /dev/null +++ b/.ipynb_checkpoints/gaf2aln-checkpoint.ipynb @@ -0,0 +1,1723 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4ffaf9f6-cc1e-4190-9351-5431c930d25b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import argparse\n", + "import concurrent.futures\n", + "import os\n", + "import re\n", + "\n", + "# Replace for argparse arguments\n", + "class arguments():\n", + " gfa = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/35Bra-v2a.chr03.gfa\"\n", + " gaf = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf\"\n", + " threads = 8\n", + " version = False\n", + "args = arguments()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "280c8847-22e8-4063-bde8-3e4e72cf20e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Toolbox\n", + "def walk2path(walk):\n", + " \"\"\"\n", + " Takes a walk in a single string and returns a list of nodes id with signs (gfa v1 like)\n", + " \"\"\"\n", + " _ = re.findall(r'>\\w+|<\\w+', walk)\n", + " # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+'\n", + " return [f'{elem[1:]}{(elem[0] == \">\")*\"+\"+(elem[0] == \"<\")*\"-\"}' for elem in _]\n", + "\n", + "def cigar2basealn(cigar):\n", + " \"\"\"\n", + " Takes a CIGAR string and convert it into a list of base level alignment.\n", + " For example : \"345=\" -> [\"=\", \"=\", ..., \"=\"] of length 345.\n", 
+ " \"\"\"\n", + " _ = re.findall(r'\\d+\\D', cigar)\n", + " final_cigar = []\n", + " for match in _:\n", + " final_cigar += [match[-1]]*int(match[:-1])\n", + "\n", + " return final_cigar\n", + "\n", + "def basealn2cigar(base_aln_list):\n", + " \n", + " last_elem = base_aln_list[0]\n", + " CIGAR = [[1, last_elem]]\n", + " for elem in base_aln_list[1:]:\n", + " if elem == last_elem:\n", + " CIGAR[-1][0] += 1\n", + "\n", + " else :\n", + " CIGAR[-1][0] = str(CIGAR[-1][0])\n", + " CIGAR.append([1, elem])\n", + " last_elem = elem\n", + " CIGAR[-1][0] = str(CIGAR[-1][0])\n", + " return \"\".join([\"\".join(block) for block in CIGAR if block[1] != \"\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be12e9d4-de76-4c8b-af84-6567549483f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::GAF Parser] Reading /home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf ...\n", + "[gaf2aln::GAF Parser] Extracting alignments ...\n", + "{'ALN_1': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7046526', '+'), ('7046528', '+'), ('7046530', '+'), ('7046531', '+'), ('7046532', '+'), ('7046533', '+'), ('7046534', '+'), ('7046536', '+'), ('7046537', '+'), ('7046539', '+'), ('7046541', '+'), ('7046542', '+'), ('7046544', '+'), ('7046546', '+'), ('7046547', '+'), ('7046549', '+'), ('7046551', '+'), ('7046552', '+'), ('7046554', '+'), ('7046556', '+'), ('7046556', '+'), ('7046556', '+'), ('7046557', '+'), ('7046558', '+'), ('7046559', '+'), ('7046560', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046562', '+'), ('7046564', '+'), ('7046565', '+'), ('7046567', '+'), ('7046568', '+'), ('7046570', '+'), ('7046571', '+'), ('7046573', '+'), ('7046574', '+'), ('7046576', '+'), 
('7046577', '+'), ('7046579', '+'), ('7046581', '+'), ('7046583', '+'), ('7046584', '+'), ('7046586', '+'), ('7046587', '+'), ('7046589', '+'), ('7046590', '+'), ('7046592', '+'), ('7046593', '+'), ('7046594', '+'), ('7046596', '+'), ('7046597', '+'), ('7046599', '+'), ('7046600', '+'), ('7046601', '+'), ('7046603', '+'), ('7046604', '+'), ('7046606', '+'), ('7046608', '+'), ('7046609', '+'), ('7046621', '+'), ('7046622', '+'), ('7046624', '+'), ('7046625', '+'), ('7046626', '+'), ('7046628', '+'), ('7046631', '+'), ('7046673', '+'), ('7046631', '+'), ('7046673', '+'), ('7046631', '+'), ('7046632', '+'), ('7046634', '+'), ('7046635', '+'), ('7046637', '+'), ('7046638', '+'), ('7046639', '+'), ('7046641', '+'), ('7046644', '+'), ('7046646', '+'), ('7046647', '+'), ('7046649', '+'), ('7046650', '+'), ('7046652', '+'), ('7046653', '+'), ('7046654', '+'), ('7046656', '+'), ('7046657', '+'), ('7046659', '+'), ('7046660', '+'), ('7046662', '+'), ('7046663', '+'), ('7046665', '+'), ('7046667', '+'), ('7046668', '+'), ('7046670', '+'), ('7046671', '+'), ('7046674', '+'), ('7046675', '+'), ('7046674', '+'), ('7046675', '+'), ('7046676', '+'), ('7046678', '+'), ('7046679', '+'), ('7046680', '+'), ('7046682', '+'), ('7046684', '+'), ('7046685', '+'), ('7046686', '+'), ('7046688', '+'), ('7046690', '+'), ('7046692', '+'), ('7046693', '+'), ('7046695', '+'), ('7046696', '+'), ('7046698', '+'), ('7046700', '+'), ('7046702', '+'), ('7046703', '+'), ('7046704', '+'), ('7046706', '+'), ('7046707', '+'), ('7046709', '+'), ('7046710', '+'), ('7046712', '+'), ('7046713', '+'), ('7046715', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046720', '+'), ('7046722', '+'), ('7046724', '+'), ('7046725', '+'), ('7046727', '+'), ('7046728', '+'), ('7046729', '+'), ('7046730', '+'), ('7046731', '+'), ('7046733', '+'), ('7046735', '+'), ('7046736', '+'), ('7046738', '+'), ('7046739', '+'), ('7046740', '+'), ('7046738', '+'), ('7046739', '+'), 
('7046740', '+'), ('7046738', '+'), ('7046739', '+'), ('7046741', '+')], 'PATH.LEN': '3822', 'ALN.START': '77', 'ALN.END': '3812', 'RES.MATCH': '3735', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '60', 'RAW.CIGAR': 'cg:Z:3735=', 'TAGS': 'AS:f:3735,dv:f:0,id:f:1'}, 'ALN_2': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7594382', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), ('7594626', '+'), ('7594011', '+'), ('7594374', '+'), ('7594375', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594241', '+'), ('7594248', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594311', '+'), ('7594330', '+'), ('7594311', '+'), ('7594315', '+'), ('7594374', '+'), ('7594311', '+'), ('7594374', '+'), ('7594369', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594021', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594286', '+'), ('7594311', '+'), ('7594374', '+'), ('7594021', '+'), ('7594286', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594350', '+'), ('7594264', '+'), ('7594207', '+'), ('7594225', '+'), ('7594227', '+'), ('7594120', '+'), ('7594132', '+'), ('7594165', '+'), ('7594172', '+')], 'PATH.LEN': '61224', 'ALN.START': '0', 'ALN.END': '3735', 'RES.MATCH': '3734', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '0', 'RAW.CIGAR': 'cg:Z:57=1X3677=', 'TAGS': 
# Parsing the .gaf file
print(f"[gaf2aln::GAF Parser] Reading {args.gaf} ...")
with open(args.gaf, 'r') as file:
    gaf_lines = file.readlines()

# Names given to the 12 mandatory GAF columns, in file order (indices 0-11).
gaf_col = [
    "QRY.NAME", "QRY.LEN", "QRY.START", "QRY.END", "STRAND",
    "PATH.MATCH", "PATH.LEN", "ALN.START", "ALN.END",
    "RES.MATCH", "ALN.BLOCK.LEN", "MAPPING.QUAL"
    ]

# Creating the dictionary that stores alignments, structured as follows:
# {"ALN_<line number>": {<column name>: <value>, "RAW.CIGAR": ..., "TAGS": ...}}
print(f"[gaf2aln::GAF Parser] Extracting alignments ...")
aln_dict = {}
for line_number, raw_line in enumerate(gaf_lines, start=1):
    ## A blank line (e.g. a trailing one) carries no alignment
    if not raw_line.strip():
        continue

    ## Splitting the line by tabulation.
    ## Only the line terminator is stripped, so a final line without a
    ## trailing newline does not lose its last character.
    line_content = raw_line.rstrip("\r\n").split('\t')

    ## Adding alignment info to the dictionary
    ## (indexing, rather than zip, keeps a short line failing loudly)
    aln_info = {
        col_name: line_content[col_index]
        for col_index, col_name in enumerate(gaf_col)
    }

    ## Splitting "PATH.MATCH" into a list of (node id, orientation) tuples.
    ## NOTE(review): walk2path is defined elsewhere; from the slicing below it
    ## presumably yields "<node id><orientation>" strings - confirm.
    aln_info["PATH.MATCH"] = [
        (str(node_id[:-1]), node_id[-1])
        for node_id in walk2path(aln_info["PATH.MATCH"])
    ]

    ## Adding CIGAR
    ## NOTE(review): assumes the cg:Z: tag is the last field of the line.
    aln_info["RAW.CIGAR"] = line_content[-1]

    ## Adding tags: optional fields start right after the mandatory columns
    ## (index 12), the last field being the CIGAR stored above.
    aln_info["TAGS"] = ",".join(line_content[len(gaf_col):-1])

    aln_dict[f"ALN_{line_number}"] = aln_info

# Getting nodes of interest ids (sorted, unique)
aln_nodes = np.unique([
    str(node_id)
    for aln_info in aln_dict.values()
    for node_id, _orient in aln_info["PATH.MATCH"]
]).tolist()

print(aln_dict)
del gaf_lines, gaf_col
# Parsing the .gfa
print(f"[gaf2aln::GFA Parser] Reading {args.gfa} ...")
with open(args.gfa, 'r') as file:
    gfa_lines = file.readlines()

# Nodes dictionary structured as follows:
# { <ALN.NODE.ID> : {
#       <PATH.NAME>: {"START": start, "END": end, "STRAND": strand},
#       <ALN.NAME>: {"START": start, "END": end, "S.OFF": start.offset, "E.OFF": end.offset, "STRAND": strand, "CIGAR": CIGAR}
#   }
# }
nodes = {node_id: {} for node_id in aln_nodes}

# Parsing the gfa
print(f"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...")

def GFA_parser(gfa_lines, nodes = nodes):
    """Extract links, node lengths and paths from GFA v1 lines.

    Parameters
    ----------
    gfa_lines : iterable of str
        Raw lines of the GFA file, with or without their line terminator.
    nodes : dict
        Unused; kept so existing calls stay valid.

    Returns
    -------
    list
        ``[links, nodes, nodes_length, paths]`` where
        - links: {<FROM.NODE.ID>: {<TO.NODE.ID>: {"FROM.ORIENT": ..., "TO.ORIENT": ...}}}
        - nodes: always an empty dict (nothing is stored in it here)
        - nodes_length: {<NODE.ID>: <NODE.LENGTH>}
        - paths: {<PATH.NAME>: {"NODES": {<NODE.ID>: <NODE.ORIENT>}, "CIGAR": <overlaps field>}}
    """
    _links, _nodes, _nodes_length, _paths = {}, {}, {}, {}
    for line in gfa_lines:
        # Strip only the line terminator so an unterminated last line is intact
        line_content = line.rstrip("\r\n").split("\t")
        line_id = line_content[0]

        # Segment line: the length is the length of the sequence field
        if line_id == "S":
            _nodes_length[str(line_content[1])] = len(line_content[2])

        # Link line: every entry uses the same FROM.ORIENT / TO.ORIENT keys,
        # whether or not the source node was already known
        elif line_id == "L":
            _links.setdefault(str(line_content[1]), {})[str(line_content[3])] = {
                "FROM.ORIENT": str(line_content[2]),
                "TO.ORIENT": str(line_content[4])
            }

        # Path line
        elif line_id == "P":
            _paths[str(line_content[1])] = {
                # NOTE: being keyed by node id, a node visited several times
                # by a path appears once (first position, last orientation).
                "NODES": {
                    str(node_id[:-1]): str(node_id[-1])
                    for node_id in line_content[2].split(',')
                },
                "CIGAR": line_content[3]
            }

    return [_links, _nodes, _nodes_length, _paths]

# Single parsing pass; the module-level dictionaries are the parser's outputs.
# links        : {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}}
# nodes_length : {<NODE.ID>: <NODE.LENGTH>}
# paths        : {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>}}
links, _unused_nodes, nodes_length, paths = GFA_parser(gfa_lines)

del gfa_lines, _unused_nodes
"[gaf2aln::Graph position processing] Running on Capitata#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on D101#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on D134#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on G06-09-28#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on G07-DH-33#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on HDEM#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on Korso#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on M249#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on OX-heart#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on PL021#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on RC34#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T02#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T03#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T04#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T06#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T07#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T08#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T09#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T10#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T11#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T12#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T13#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T14#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T15#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T16#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T17#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T18#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on 
print(f"[gaf2aln::Graph position processing] Computing nodes positions in each paths...")

# Set of aligned node ids, built once so membership tests are O(1)
_aln_node_set = set(aln_nodes)

def get_node_pos(path_name, nodes = nodes, paths = paths, nodes_length = nodes_length):
    """Return the coordinates, on one path, of every aligned node it contains.

    Parameters
    ----------
    path_name : str
        Key of the path in ``paths``.
    nodes : dict
        Unused; kept so existing calls stay valid.
    paths : dict
        {<PATH.NAME>: {"NODES": {<NODE.ID>: <NODE.ORIENT>}, ...}}
    nodes_length : dict
        {<NODE.ID>: <NODE.LENGTH>}

    Returns
    -------
    dict
        {<NODE.ID>: {"START": start, "END": end, "STRAND": strand}} for the
        nodes of the path that belong to the aligned node set. START is
        0-based and END is START + node length, so consecutive nodes abut
        (next START == previous END).
    """
    print(f"[gaf2aln::Graph position processing] Running on {path_name} ...")
    cur_pos = 0

    out = {}
    # Iterating over nodes in the path, in path order
    for path_node, strand in paths[path_name]["NODES"].items():
        node_len = nodes_length[path_node]

        if path_node in _aln_node_set:
            out[path_node] = {
                "START": cur_pos, # Start position of the node in the current path
                "END": cur_pos+node_len, # End position of the node in the current path
                "STRAND": strand # Orientation of the node in the current path
            }

        # Advance by exactly the node length: adding one more base per node
        # would shift every downstream coordinate.
        cur_pos += node_len

    return out

res = {}
# Adding nodes positions relative to path.
# The context manager waits for all submitted jobs, also on error.
with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
    for path_name in paths:
        res[path_name] = executor.submit(get_node_pos, path_name)

for path_name, out in res.items():
    results = out.result()
    for path_node, node_pos in results.items():
        nodes[path_node][path_name] = node_pos

del res
1382\n", + "1382 1434 0 0 + 52 1434\n", + "1434 1451 0 0 + 17 1451\n", + "1451 1531 0 0 + 80 1531\n", + "1531 1532 0 0 + 1 1532\n", + "1532 1543 0 0 + 11 1543\n", + "1543 1544 0 0 + 1 1544\n", + "1544 1572 0 0 + 28 1572\n", + "1572 1573 0 0 + 1 1573\n", + "1573 1587 0 0 + 14 1587\n", + "1587 1588 0 0 + 1 1588\n", + "1588 1616 0 0 + 28 1616\n", + "1616 1617 0 0 + 1 1617\n", + "1617 1646 0 0 + 29 1646\n", + "1646 1661 0 0 + 15 1661\n", + "1661 1673 0 0 + 12 1673\n", + "1673 1674 0 0 + 1 1674\n", + "1674 1726 0 0 + 52 1726\n", + "1726 1727 0 0 + 1 1727\n", + "1727 1762 0 0 + 35 1762\n", + "1762 1763 0 0 + 1 1763\n", + "1763 1764 0 0 + 1 1764\n", + "1764 1765 0 0 + 1 1765\n", + "1765 1766 0 0 + 1 1766\n", + "1766 1767 0 0 + 1 1767\n", + "1767 1824 0 0 + 57 1824\n", + "1824 1825 0 0 + 1 1825\n", + "1825 1975 0 0 + 150 1975\n", + "1975 1976 0 0 + 1 1976\n", + "1976 2015 0 0 + 39 2015\n", + "2015 2016 0 0 + 1 2016\n", + "2016 2047 0 0 + 31 2047\n", + "2047 2055 0 0 + 8 2055\n", + "2055 2056 0 0 + 1 2056\n", + "2056 2120 0 0 + 64 2120\n", + "2120 2121 0 0 + 1 2121\n", + "2121 2157 0 0 + 36 2157\n", + "2157 2158 0 0 + 1 2158\n", + "2158 2170 0 0 + 12 2170\n", + "2170 2171 0 0 + 1 2171\n", + "2171 2205 0 0 + 34 2205\n", + "2205 2206 0 0 + 1 2206\n", + "2206 2344 0 0 + 138 2344\n", + "2344 2345 0 0 + 1 2345\n", + "2345 2364 0 0 + 19 2364\n", + "2364 2383 0 0 + 19 2383\n", + "2383 2408 0 0 + 25 2408\n", + "2408 2409 0 0 + 1 2409\n", + "2409 2441 0 0 + 32 2441\n", + "2441 2442 0 0 + 1 2442\n", + "2442 2580 0 0 + 138 2580\n", + "2580 2581 0 0 + 1 2581\n", + "2581 2582 0 0 + 1 2582\n", + "2582 2583 0 0 + 1 2583\n", + "2583 2584 0 0 + 1 2584\n", + "2584 2764 0 0 + 180 2764\n", + "2764 2765 0 0 + 1 2765\n", + "2765 2797 0 0 + 32 2797\n", + "2797 2798 0 0 + 1 2798\n", + "2798 2878 0 0 + 80 2878\n", + "2878 2879 0 0 + 1 2879\n", + "2879 2951 0 0 + 72 2951\n", + "2951 2952 0 0 + 1 2952\n", + "2952 3002 0 0 + 50 3002\n", + "3002 3077 0 0 + 75 3077\n", + "3077 3078 0 0 + 1 3078\n", + 
"3078 3093 0 0 + 15 3093\n", + "3093 3094 0 0 + 1 3094\n", + "3094 3097 0 0 + 3 3097\n", + "3097 3140 0 0 + 43 3140\n", + "3140 3210 0 0 + 70 3210\n", + "3210 3211 0 0 + 1 3211\n", + "3211 3229 0 0 + 18 3229\n", + "3229 3230 0 0 + 1 3230\n", + "3230 3276 0 0 + 46 3276\n", + "3276 3277 0 0 + 1 3277\n", + "3277 3315 0 0 + 38 3315\n", + "3315 3316 0 0 + 1 3316\n", + "3316 3322 0 0 + 6 3322\n", + "3322 3323 0 0 + 1 3323\n", + "3323 3348 0 0 + 25 3348\n", + "3348 3349 0 0 + 1 3349\n", + "3349 3350 0 0 + 1 3350\n", + "3350 3351 0 0 + 1 3351\n", + "3351 3352 0 0 + 1 3352\n", + "3352 3353 0 0 + 1 3353\n", + "3353 3354 0 0 + 1 3354\n", + "3354 3356 0 0 + 2 3356\n", + "3356 3357 0 0 + 1[gaf2aln::Alignment position processing] Running on ALN_2 ...\n", + " 03357 \n", + "13357 03489 0 +0 10 1+\n", + " 1132 23489 \n", + "03489 03490 +0 10 2\n", + "+2 3 10 34900\n", + " 3490+ 36421 03 \n", + "03 +4 0 1520 3642+\n", + " 36421 36444 \n", + "04 05 +0 20 3644+\n", + " 36441 5 \n", + "36855 06 00 +0 41+ 36851\n", + " 36856 \n", + "36876 07 00 +0 2+ 36871\n", + " 36877 \n", + "36937 08 00 +0 6+ 36931\n", + " 36938 \n", + "36948 09 00 +0 1+ 36941\n", + " 36949 \n", + "37089 010 00 +0 14+ 37081\n", + " 370810 \n", + "370910 011 00 +0 1+ 37091\n", + " 370911 \n", + "371011 012 00 +0 1+ 37101\n", + " 371012 \n", + "371412 013 00 +0 4+ 37141\n", + " 371413 \n", + "371513 014 00 +0 1+ 37151\n", + " 371514 \n", + "371614 015 00 +0 1+ 37161\n", + " 371615 \n", + "372015 016 00 +0 4+ 37201\n", + " 372016\n", + " 163721 170 00 0+ +1 13721 \n", + "173721\n", + " 173722 180 00 0+ +1 13722 \n", + "183722\n", + " 183735 19 00 100 ++ 231 373519\n", + "\n", + "19 20 0 0 + 1 20\n", + "20 21 0 0 + 1 21\n", + "21 22 0 0 + 1 22\n", + "22 23 0 0 + 1 23\n", + "23 24 0 0 + 1 24\n", + "24 25 0 0 + 1 25\n", + "25 26 0 0 + 1 26\n", + "26 27 0 0 + 1 27\n", + "27 28 0 0 + 1 28\n", + "28 29 0 0 + 1 29\n", + "29 30 0 0 + 1 30\n", + "30 31 0 0 + 1 31\n", + "31 32 0 0 + 1 32\n", + "32 33 0 0 + 1 33\n", + "33 34 0 0 + 
print(f"[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...")
# Adding nodes positions relative to the alignment

def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length):
    """Compute the interval covered by each node of one alignment walk.

    Positions start at 0 on the first node of the walk and advance by the
    aligned length of each node. The first node is entered ``ALN.START``
    bases after its start; the END of the last node is ``QRY.END``.
    NOTE(review): this presumes QRY.START is 0 and that the alignment has no
    indels - confirm against the callers.

    Parameters
    ----------
    aln_name : str
        Key of the alignment in ``aln_dict``.
    aln_dict : dict
        Alignments; each needs "PATH.MATCH" (list of (node id, orientation)),
        "ALN.START" and "QRY.END".
    nodes_length : dict
        {<NODE.ID>: <NODE.LENGTH>}

    Returns
    -------
    dict
        {<NODE.ID>: {"START", "END", "S.OFF", "E.OFF", "STRAND"}}. Being keyed
        by node id, a node visited several times keeps only its last visit.
        An empty walk gives an empty dict.
    """
    aln_info = aln_dict[aln_name]
    walk = aln_info["PATH.MATCH"]
    last_index = len(walk) - 1

    # Initializing current position in query
    cur_pos = 0

    # Creating result dictionary
    res = {}

    ## Iterating over node_ids from the given alignment.
    ## First/last nodes are recognised by their rank in the walk, not by
    ## their id, so a revisited node is not mistaken for a walk boundary.
    for index, (node_id, orient) in enumerate(walk):
        node_len = nodes_length[node_id]

        start_pos = cur_pos
        # Only the first node can be entered after its own start
        s_off = int(aln_info["ALN.START"]) if index == 0 else 0

        if index == last_index:
            # End node (also reached by a single-node walk)
            end_pos = int(aln_info["QRY.END"])
            e_off = node_len - s_off - (end_pos - start_pos)
        else:
            # First or in-between node: covered up to its end
            end_pos = start_pos + node_len - s_off
            e_off = 0

        res[node_id] = {
            "START": start_pos, # Start position on the query
            "END": end_pos, # End position on the query
            "S.OFF": s_off, # Offset between the start of the alignment and the node's start
            "E.OFF": e_off, # Offset between the end of the alignment and the node's end
            "STRAND": orient # Orientation of the node in the alignment
        }

        cur_pos = end_pos
        print(start_pos, end_pos, s_off, e_off, orient, node_len, cur_pos)

    return res

# Storing alignment
res = {}
# The context manager waits for all submitted jobs, also on error.
with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
    for aln_name in aln_dict:
        print(f"[gaf2aln::Alignment position processing] Running on {aln_name} ...")

        res[aln_name] = executor.submit(get_aln_node_info, aln_name)

for aln_name, node_info in res.items():
    results = node_info.result()
    for node_id, info in results.items():
        nodes[node_id][aln_name] = info

del res
# Per-node CIGAR: slice each alignment's base-level CIGAR with the node's
# START/END interval and store the re-encoded result on the node entry.
print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...")
for aln, aln_info in aln_dict.items():

    print(f"[gaf2aln::CIGAR processing] Running on {aln} ...")
    # NOTE(review): cigar2basealn / basealn2cigar are defined elsewhere;
    # presumably they expand "1=1X..." into ["=", "X", ...] and back - confirm.
    raw_cigar = cigar2basealn(aln_info["RAW.CIGAR"])
    CIGAR = {}

    for node_id, orient in aln_info["PATH.MATCH"]:
        # Entry of this node for the current alignment
        node_aln = nodes[node_id][aln]

        _cigar = basealn2cigar(raw_cigar[node_aln["START"]:node_aln["END"]])
        node_aln["CIGAR"] = _cigar
'T.END': 73306365, 'CG': '63='}\n", + "7046536 D134#1#chr03 203 379\n", + "\tIn path\n", + "\t 73306368 73306544\n", + "{'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}\n", + "7046537 D134#1#chr03 379 380\n", + "\tIn path\n", + "\t 73306545 73306546\n", + "{'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}\n", + "7046539 D134#1#chr03 380 429\n", + "\tIn path\n", + "\t 73306547 73306596\n", + "{'Q.START': 379, 'Q.END': 380, 'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}\n", + "7046541 D134#1#chr03 429 430\n", + "\tIn path\n", + "\t 73306597 73306598\n", + "{'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}\n", + "7046542 D134#1#chr03 430 457\n", + "\tIn path\n", + "\t 73306599 73306626\n", + "{'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}\n", + "7046544 D134#1#chr03 457 492\n", + "\tIn path\n", + "\t 73306641 73306676\n", + "{'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}\n", + "7046546 D134#1#chr03 492 494\n", + "\tNot in path\n", + "7046547 D134#1#chr03 494 497\n", + "\tNot in path\n", + "7046549 D134#1#chr03 497 507\n", + "\tNot in path\n", + "7046551 D134#1#chr03 507 508\n", + "\tNot in path\n", + "7046552 D134#1#chr03 508 564\n", + "\tIn path\n", + "\t 73306694 73306750\n", + "{'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}\n", + "7046554 D134#1#chr03 564 566\n", + "\tNot in path\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 
73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046557 D134#1#chr03 569 824\n", + "\tIn path\n", + "\t 73306755 73307010\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046558 D134#1#chr03 824 826\n", + "\tNot in path\n", + "7046559 D134#1#chr03 826 858\n", + "\tIn path\n", + "\t 73307011 73307043\n", + "{'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}\n", + "7046560 D134#1#chr03 858 859\n", + "\tIn path\n", + "\t 73307044 73307045\n", + "{'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 
D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046562 D134#1#chr03 869 913\n", + "\tIn path\n", + "\t 73307048 73307092\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046564 D134#1#chr03 913 919\n", + "\tIn path\n", + "\t 73307093 73307099\n", + "{'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}\n", + "7046565 D134#1#chr03 919 978\n", + "\tIn path\n", + "\t 73307100 73307159\n", + "{'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}\n", + "7046567 D134#1#chr03 978 979\n", + "\tIn path\n", + "\t 73307160 73307161\n", + "{'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}\n", + "7046568 D134#1#chr03 979 1038\n", + "\tIn path\n", + "\t 73307162 73307221\n", + "{'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}\n", + "7046570 D134#1#chr03 1038 1045\n", + "\tIn path\n", + "\t 73307224 73307231\n", + "{'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}\n", + "7046571 D134#1#chr03 1045 1046\n", + "\tIn path\n", + "\t 73307232 73307233\n", + "{'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}\n", + "7046573 D134#1#chr03 1046 1080\n", + "\tIn path\n", + "\t 73307234 73307268\n", + "{'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}\n", + "7046574 D134#1#chr03 1080 1081\n", + "\tIn path\n", + "\t 73307269 73307270\n", + "{'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 'CG': '34='}\n", + "7046576 D134#1#chr03 1081 1107\n", + "\tIn path\n", + "\t 73307271 73307297\n", + "{'Q.START': 1080, 
'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}\n", + "7046577 D134#1#chr03 1107 1108\n", + "\tNot in path\n", + "7046579 D134#1#chr03 1108 1183\n", + "\tIn path\n", + "\t 73307300 73307375\n", + "{'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}\n", + "7046581 D134#1#chr03 1183 1186\n", + "\tIn path\n", + "\t 73307376 73307379\n", + "{'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}\n", + "7046583 D134#1#chr03 1186 1224\n", + "\tNot in path\n", + "7046584 D134#1#chr03 1224 1257\n", + "\tIn path\n", + "\t 73307419 73307452\n", + "{'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}\n", + "7046586 D134#1#chr03 1257 1289\n", + "\tNot in path\n", + "7046587 D134#1#chr03 1289 1311\n", + "\tIn path\n", + "\t 73307475 73307497\n", + "{'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}\n", + "7046589 D134#1#chr03 1311 1359\n", + "\tNot in path\n", + "7046590 D134#1#chr03 1359 1382\n", + "\tIn path\n", + "\t 73307546 73307569\n", + "{'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}\n", + "7046592 D134#1#chr03 1382 1434\n", + "\tNot in path\n", + "7046593 D134#1#chr03 1434 1451\n", + "\tIn path\n", + "\t 73307643 73307660\n", + "{'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}\n", + "7046594 D134#1#chr03 1451 1531\n", + "\tIn path\n", + "\t 73307661 73307741\n", + "{'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}\n", + "7046596 D134#1#chr03 1531 1532\n", + "\tNot in path\n", + "7046597 D134#1#chr03 1532 1543\n", + "\tIn path\n", + "\t 73307744 73307755\n", + "{'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}\n", + "7046599 D134#1#chr03 1543 1544\n", + "\tNot in path\n", + "7046600 D134#1#chr03 1544 1572\n", + "\tIn path\n", + "\t 73307758 73307786\n", + "{'Q.START': 1532, 
'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}\n", + "7046601 D134#1#chr03 1572 1573\n", + "\tIn path\n", + "\t 73307787 73307788\n", + "{'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}\n", + "7046603 D134#1#chr03 1573 1587\n", + "\tIn path\n", + "\t 73307789 73307803\n", + "{'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}\n", + "7046604 D134#1#chr03 1587 1588\n", + "\tNot in path\n", + "7046606 D134#1#chr03 1588 1616\n", + "\tIn path\n", + "\t 73307806 73307834\n", + "{'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}\n", + "7046608 D134#1#chr03 1616 1617\n", + "\tIn path\n", + "\t 73307835 73307836\n", + "{'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}\n", + "7046609 D134#1#chr03 1617 1646\n", + "\tIn path\n", + "\t 73307837 73307866\n", + "{'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 'T.END': 73307836, 'CG': '1='}\n", + "7046621 D134#1#chr03 1646 1661\n", + "\tIn path\n", + "\t 73307867 73307882\n", + "{'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}\n", + "7046622 D134#1#chr03 1661 1673\n", + "\tIn path\n", + "\t 73307883 73307895\n", + "{'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}\n", + "7046624 D134#1#chr03 1673 1674\n", + "\tIn path\n", + "\t 73307896 73307897\n", + "{'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}\n", + "7046625 D134#1#chr03 1674 1726\n", + "\tIn path\n", + "\t 73307898 73307950\n", + "{'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}\n", + "7046626 D134#1#chr03 1726 1727\n", + "\tNot in path\n", + "7046628 D134#1#chr03 1727 1762\n", + "\tIn path\n", + "\t 73307953 73307988\n", + "{'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn 
path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}\n", + "7046673 D134#1#chr03 1765 1766\n", + "\tIn path\n", + "\t 73307993 73307994\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", + "7046673 D134#1#chr03 1765 1766\n", + "\tIn path\n", + "\t 73307993 73307994\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", + "7046632 D134#1#chr03 1767 1824\n", + "\tIn path\n", + "\t 73307995 73308052\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046634 D134#1#chr03 1824 1825\n", + "\tIn path\n", + "\t 73308053 73308054\n", + "{'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}\n", + "7046635 D134#1#chr03 1825 1975\n", + "\tIn path\n", + "\t 73308055 73308205\n", + "{'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}\n", + "7046637 D134#1#chr03 1975 1976\n", + "\tNot in path\n", + "7046638 D134#1#chr03 1976 2015\n", + "\tIn path\n", + "\t 73308208 73308247\n", + "{'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}\n", + "7046639 D134#1#chr03 2015 2016\n", + "\tNot in path\n", + "7046641 D134#1#chr03 2016 2047\n", + "\tIn path\n", + "\t 73308250 73308281\n", + "{'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}\n", + "7046644 D134#1#chr03 2047 2055\n", + "\tIn path\n", + "\t 73308286 73308294\n", + "{'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': 
'31='}\n", + "7046646 D134#1#chr03 2055 2056\n", + "\tNot in path\n", + "7046647 D134#1#chr03 2056 2120\n", + "\tIn path\n", + "\t 73308297 73308361\n", + "{'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}\n", + "7046649 D134#1#chr03 2120 2121\n", + "\tIn path\n", + "\t 73308362 73308363\n", + "{'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}\n", + "7046650 D134#1#chr03 2121 2157\n", + "\tIn path\n", + "\t 73308364 73308400\n", + "{'Q.START': 2120, 'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}\n", + "7046652 D134#1#chr03 2157 2158\n", + "\tNot in path\n", + "7046653 D134#1#chr03 2158 2170\n", + "\tIn path\n", + "\t 73308403 73308415\n", + "{'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}\n", + "7046654 D134#1#chr03 2170 2171\n", + "\tIn path\n", + "\t 73308416 73308417\n", + "{'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}\n", + "7046656 D134#1#chr03 2171 2205\n", + "\tIn path\n", + "\t 73308418 73308452\n", + "{'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}\n", + "7046657 D134#1#chr03 2205 2206\n", + "\tNot in path\n", + "7046659 D134#1#chr03 2206 2344\n", + "\tIn path\n", + "\t 73308455 73308593\n", + "{'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 73308452, 'CG': '34='}\n", + "7046660 D134#1#chr03 2344 2345\n", + "\tNot in path\n", + "7046662 D134#1#chr03 2345 2364\n", + "\tIn path\n", + "\t 73308596 73308615\n", + "{'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}\n", + "7046663 D134#1#chr03 2364 2383\n", + "\tIn path\n", + "\t 73308616 73308635\n", + "{'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}\n", + "7046665 D134#1#chr03 2383 2408\n", + "\tIn path\n", + "\t 73308636 73308661\n", + "{'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': 
'19='}\n", + "7046667 D134#1#chr03 2408 2409\n", + "\tIn path\n", + "\t 73308662 73308663\n", + "{'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}\n", + "7046668 D134#1#chr03 2409 2441\n", + "\tIn path\n", + "\t 73308664 73308696\n", + "{'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}\n", + "7046670 D134#1#chr03 2441 2442\n", + "\tIn path\n", + "\t 73308697 73308698\n", + "{'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': '32='}\n", + "7046671 D134#1#chr03 2442 2580\n", + "\tIn path\n", + "\t 73308699 73308837\n", + "{'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}\n", + "7046674 D134#1#chr03 2582 2583\n", + "\tIn path\n", + "\t 73308838 73308839\n", + "{'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': '138='}\n", + "7046675 D134#1#chr03 2583 2584\n", + "\tIn path\n", + "\t 73308840 73308841\n", + "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", + "7046674 D134#1#chr03 2582 2583\n", + "\tIn path\n", + "\t 73308838 73308839\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", + "7046675 D134#1#chr03 2583 2584\n", + "\tIn path\n", + "\t 73308840 73308841\n", + "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", + "7046676 D134#1#chr03 2584 2764\n", + "\tIn path\n", + "\t 73308842 73309022\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", + "7046678 D134#1#chr03 2764 2765\n", + "\tNot in path\n", + "7046679 D134#1#chr03 2765 2797\n", + "\tIn path\n", + "\t 73309025 73309057\n", + "{'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}\n", + "7046680 D134#1#chr03 2797 2798\n", + "\tNot in path\n", + "7046682 D134#1#chr03 2798 2878\n", + "\tIn path\n", + "\t 73309060 73309140\n", + "{'Q.START': 2765, 'Q.END': 
2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}\n", + "7046684 D134#1#chr03 2878 2879\n", + "\tIn path\n", + "\t 73309141 73309142\n", + "{'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}\n", + "7046685 D134#1#chr03 2879 2951\n", + "\tIn path\n", + "\t 73309143 73309215\n", + "{'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}\n", + "7046686 D134#1#chr03 2951 2952\n", + "\tIn path\n", + "\t 73309216 73309217\n", + "{'Q.START': 2879, 'Q.END': 2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}\n", + "7046688 D134#1#chr03 2952 3002\n", + "\tIn path\n", + "\t 73309218 73309268\n", + "{'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}\n", + "7046690 D134#1#chr03 3002 3077\n", + "\tIn path\n", + "\t 73309271 73309346\n", + "{'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}\n", + "7046692 D134#1#chr03 3077 3078\n", + "\tIn path\n", + "\t 73309347 73309348\n", + "{'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}\n", + "7046693 D134#1#chr03 3078 3093\n", + "\tIn path\n", + "\t 73309349 73309364\n", + "{'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}\n", + "7046695 D134#1#chr03 3093 3094\n", + "\tNot in path\n", + "7046696 D134#1#chr03 3094 3097\n", + "\tIn path\n", + "\t 73309367 73309370\n", + "{'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}\n", + "7046698 D134#1#chr03 3097 3140\n", + "\tIn path\n", + "\t 73309371 73309414\n", + "{'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 73309370, 'CG': '3='}\n", + "7046700 D134#1#chr03 3140 3210\n", + "\tIn path\n", + "\t 73309415 73309485\n", + "{'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}\n", + "7046702 D134#1#chr03 3210 3211\n", + "\tIn path\n", + "\t 73309486 73309487\n", + "{'Q.START': 3140, 'Q.END': 3210, 
'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}\n", + "7046703 D134#1#chr03 3211 3229\n", + "\tIn path\n", + "\t 73309488 73309506\n", + "{'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}\n", + "7046704 D134#1#chr03 3229 3230\n", + "\tIn path\n", + "\t 73309507 73309508\n", + "{'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}\n", + "7046706 D134#1#chr03 3230 3276\n", + "\tIn path\n", + "\t 73309509 73309555\n", + "{'Q.START': 3229, 'Q.END': 3230, 'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}\n", + "7046707 D134#1#chr03 3276 3277\n", + "\tNot in path\n", + "7046709 D134#1#chr03 3277 3315\n", + "\tIn path\n", + "\t 73309558 73309596\n", + "{'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}\n", + "7046710 D134#1#chr03 3315 3316\n", + "\tNot in path\n", + "7046712 D134#1#chr03 3316 3322\n", + "\tIn path\n", + "\t 73309599 73309605\n", + "{'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}\n", + "7046713 D134#1#chr03 3322 3323\n", + "\tNot in path\n", + "7046715 D134#1#chr03 3323 3348\n", + "\tIn path\n", + "\t 73309608 73309633\n", + "{'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}\n", + "7046718 D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}\n", + "7046717 D134#1#chr03 3351 3352\n", + "\tIn path\n", + "\t 73309636 73309637\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046718 D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", + "7046717 D134#1#chr03 3351 3352\n", + "\tIn path\n", + "\t 73309636 73309637\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046718 
D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", + "7046720 D134#1#chr03 3353 3354\n", + "\tIn path\n", + "\t 73309638 73309639\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046722 D134#1#chr03 3354 3356\n", + "\tIn path\n", + "\t 73309640 73309642\n", + "{'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}\n", + "7046724 D134#1#chr03 3356 3357\n", + "\tNot in path\n", + "7046725 D134#1#chr03 3357 3489\n", + "\tIn path\n", + "\t 73309645 73309777\n", + "{'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}\n", + "7046727 D134#1#chr03 3489 3490\n", + "\tNot in path\n", + "7046728 D134#1#chr03 3490 3642\n", + "\tIn path\n", + "\t 73309780 73309932\n", + "{'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}\n", + "7046729 D134#1#chr03 3642 3644\n", + "\tNot in path\n", + "7046730 D134#1#chr03 3644 3685\n", + "\tIn path\n", + "\t 73309933 73309974\n", + "{'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}\n", + "7046731 D134#1#chr03 3685 3687\n", + "\tNot in path\n", + "7046733 D134#1#chr03 3687 3693\n", + "\tIn path\n", + "\t 73309977 73309983\n", + "{'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}\n", + "7046735 D134#1#chr03 3693 3694\n", + "\tNot in path\n", + "7046736 D134#1#chr03 3694 3708\n", + "\tIn path\n", + "\t 73309986 73310000\n", + "{'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 
73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046740 D134#1#chr03 3716 3720\n", + "\tIn path\n", + "\t 73310005 73310009\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046740 D134#1#chr03 3716 3720\n", + "\tIn path\n", + "\t 73310005 73310009\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046741 D134#1#chr03 3722 3735\n", + "\tIn path\n", + "\t 73310012 73310045\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "ALN_2\n", + "7594382 D134#1#chr03 0 1\n", + "\tIn path\n", + "\t 70220037 70220038\n", + "skipped\n", + "\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 0, 'Q.END': 1, 'T.START': 70220037, 'T.END': 70220038, 'CG': '1='}\n", + "7594371 D134#1#chr03 15 16\n", + "\tIn path\n", + "\t 70221163 70221164\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 15, 'Q.END': 16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 
'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594626 D134#1#chr03 10 11\n", + "\tIn path\n", + "\t 70219214 70219215\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594011 D134#1#chr03 11 12\n", + "\tIn path\n", + "\t 70219995 70219996\n", + "{'Q.START': 10, 'Q.END': 11, 'T.START': 70219214, 'T.END': 70219215, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 11, 'Q.END': 12, 'T.START': 70219995, 'T.END': 70219996, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594371 D134#1#chr03 15 16\n", + "\tIn path\n", + "\t 70221163 70221164\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 15, 'Q.END': 
16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594241 D134#1#chr03 20 21\n", + "\tIn path\n", + "\t 70219220 70219221\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594248 D134#1#chr03 21 22\n", + "\tNot in path\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 20, 'Q.END': 21, 'T.START': 70219220, 'T.END': 70219221, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", + "7594330 D134#1#chr03 26 27\n", + "\tNot in path\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': 
'1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594026 D134#1#chr03 37 38\n", + "\tIn path\n", + "\t 70220249 70220250\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594026 D134#1#chr03 37 38\n", + "\tIn path\n", + "\t 70220249 70220250\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 
'CG': '1X'}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 
70219352, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 
'T.END': 70221599, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594350 D134#1#chr03 70 71\n", + "\tIn path\n", + "\t 70219226 70219227\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594264 D134#1#chr03 71 72\n", + "\tIn path\n", + "\t 70219228 70219229\n", + "{'Q.START': 70, 'Q.END': 71, 'T.START': 70219226, 'T.END': 70219227, 'CG': '1='}\n", + "7594207 D134#1#chr03 72 73\n", + "\tIn path\n", + "\t 70219230 70219231\n", + "{'Q.START': 71, 'Q.END': 72, 'T.START': 70219228, 'T.END': 70219229, 'CG': '1='}\n", + "7594225 D134#1#chr03 73 74\n", + "\tIn path\n", + "\t 70219232 70219233\n", + "{'Q.START': 72, 'Q.END': 73, 'T.START': 70219230, 'T.END': 70219231, 'CG': '1='}\n", + "7594227 D134#1#chr03 74 75\n", + "\tIn path\n", + "\t 70220150 70220151\n", + "{'Q.START': 73, 'Q.END': 74, 'T.START': 70219232, 'T.END': 70219233, 'CG': '1='}\n", + "7594120 D134#1#chr03 75 76\n", + "\tIn path\n", + "\t 70219236 70219237\n", + "{'Q.START': 74, 'Q.END': 75, 'T.START': 70220150, 'T.END': 70220151, 'CG': '1='}\n", + "7594132 D134#1#chr03 76 77\n", + "\tIn path\n", + "\t 70219777 70219778\n", + "{'Q.START': 75, 'Q.END': 76, 'T.START': 70219236, 'T.END': 70219237, 'CG': '1='}\n", + "7594165 D134#1#chr03 77 78\n", + "\tIn path\n", + "\t 70219240 70219241\n", + "{'Q.START': 76, 'Q.END': 77, 'T.START': 
70219777, 'T.END': 70219778, 'CG': '1='}\n", + "7594172 D134#1#chr03 78 3735\n", + "\tNot in path\n" + ] + } + ], + "source": [ + "ALNS = {}\n", + "## Iterating over alignments\n", + "for aln_name in aln_dict.keys():\n", + " \n", + " ## Iterating over paths of the gfa\n", + " for path_name in paths.keys():\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(aln_name)\n", + " _ = [] # Temporary list holding alignment blocks\n", + "\n", + " ## Iterating over alignment nodes of the current alignment\n", + " for node_id, orient in aln_dict[aln_name][\"PATH.MATCH\"]:\n", + "\n", + " # Getting node info\n", + " n_info = nodes[node_id]\n", + " q_start = n_info[aln_name][\"START\"] # Start position on the query\n", + " q_end = n_info[aln_name][\"END\"] # End position on the query\n", + " _CG = n_info[aln_name][\"CIGAR\"] # Cigar of the alignment on the current node\n", + "\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(node_id, path_name, q_start, q_end)\n", + "\n", + " ## Checking if path is traversing the current node\n", + " if path_name in list(n_info.keys()):\n", + " if path_name == \"D134#1#chr03\": print(\"\\tIn path\")\n", + "\n", + " ## Getting start and end position on the target given the orientation of the node in the alignment and the path\n", + " if n_info[aln_name][\"STRAND\"] == n_info[path_name][\"STRAND\"] :\n", + " t_start = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", + " t_end = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"] \n", + " else :\n", + " t_end = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", + " t_start = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"]\n", + "\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\t\", t_start, t_end)\n", + "\n", + " \"\"\"\n", + " If the latest block t.end and q.end matches with the current node t.start and q.start, \n", + " the node should be added to the block. 
Else, we terminate the block and add the node to a new block\n", + " \"\"\"\n", + " \n", + " # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : \n", + " if len(_) and _[-1][\"T.END\"] == t_start and _[-1][\"Q.END\"]+1 == q_start: \n", + " tmp_aln[\"Q.END\"] = q_end\n", + " tmp_aln[\"T.END\"] = t_end\n", + " tmp_aln[\"CG\"] += _CG\n", + "# elif len(_) and _[-1][\"T.END\"] == t_start: # Following on the target not on the query (i.e. Insertion)\n", + "# tmp_aln[\"T.END\"] = t_end\n", + "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}I\"\n", + "# elif len(_) and _[-1][\"Q.END\"]+1 == q_start: # Following on the query, not on the target (i.e. Deletion)\n", + "# tmp_aln[\"Q.END\"] = q_end\n", + "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}D\"\n", + " else : # Else, completely different\n", + " try : \n", + " _.append(tmp_aln)\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(tmp_aln)\n", + " except : \n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"skipped\\n\")\n", + " tmp_aln = {\n", + " \"Q.START\": q_start,\n", + " \"Q.END\": q_end,\n", + " \"T.START\": t_start,\n", + " \"T.END\": t_end,\n", + " \"CG\": _CG,\n", + " }\n", + " \n", + " else : \n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\tNot in path\")\n", + " # Node is not in the path\n", + "\n", + " del tmp_aln\n", + " \n", + " ALNS[(path_name, aln_name)] = _" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "547f03fa-cbd5-42f9-b668-1ca4404795ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'Q.START': 0, 'Q.END': 77, 'T.START': 73306158, 'T.END': 73306235, 'CG': '77='}, {'Q.START': 77, 'Q.END': 82, 'T.START': 73306238, 'T.END': 73306243, 'CG': '5='}, {'Q.START': 83, 'Q.END': 138, 'T.START': 73306246, 'T.END': 73306301, 'CG': '55='}, {'Q.START': 139, 'Q.END': 202, 'T.START': 73306302, 'T.END': 73306365, 'CG': 
'63='}, {'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}, {'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}, {'Q.START': 379, 'Q.END': 380, 'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}, {'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}, {'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}, {'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}, {'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}, {'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}, {'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}, {'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 
73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}, {'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}, {'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}, {'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}, {'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}, {'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}, {'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}, {'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 'CG': '34='}, {'Q.START': 1080, 'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}, {'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}, {'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}, {'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}, {'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}, {'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}, {'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}, {'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}, {'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}, {'Q.START': 1532, 'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}, {'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}, {'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}, {'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}, {'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}, {'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 
'T.END': 73307836, 'CG': '1='}, {'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}, {'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}, {'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}, {'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}, {'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}, {'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}, {'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}, {'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}, {'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}, {'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': '31='}, {'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}, {'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}, {'Q.START': 2120, 'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}, {'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}, {'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}, {'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}, {'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 
73308452, 'CG': '34='}, {'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}, {'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}, {'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': '19='}, {'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}, {'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}, {'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': '32='}, {'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}, {'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': '138='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}, {'Q.START': 2765, 'Q.END': 2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}, {'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}, {'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}, {'Q.START': 2879, 'Q.END': 2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}, {'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}, {'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}, {'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}, {'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}, {'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}, {'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 
73309370, 'CG': '3='}, {'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}, {'Q.START': 3140, 'Q.END': 3210, 'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}, {'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}, {'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}, {'Q.START': 3229, 'Q.END': 3230, 'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}, {'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}, {'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}, {'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}, {'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}, {'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}, {'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}, {'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}, {'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}, {'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}, {'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 
'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}]\n" + ] + } + ], + "source": [ + "print(ALNS[(\"D134#1#chr03\", \"ALN_1\")])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gaf2aln.ipynb b/gaf2aln.ipynb new file mode 100644 index 0000000..04fe866 --- /dev/null +++ b/gaf2aln.ipynb @@ -0,0 +1,2443 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4ffaf9f6-cc1e-4190-9351-5431c930d25b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import argparse\n", + "import concurrent.futures\n", + "import os\n", + "import re\n", + "\n", + "# Replace for argparse arguments\n", + "class arguments():\n", + " gfa = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/35Bra-v2a.chr03.gfa\"\n", + " gaf = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf\"\n", + " threads = 8\n", + " version = False\n", + "args = arguments()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "280c8847-22e8-4063-bde8-3e4e72cf20e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Toolbox\n", + "def 
walk2path(walk):\n", + " \"\"\"\n", + " Takes a walk in a single string and returns a list of nodes id with signs (gfa v1 like)\n", + " \"\"\"\n", + " _ = re.findall(r'>\\w+|<\\w+', walk)\n", + " # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+'\n", + " return [f'{elem[1:]}{(elem[0] == \">\")*\"+\"+(elem[0] == \"<\")*\"-\"}' for elem in _]\n", + "\n", + "def cigar2basealn(cigar):\n", + " \"\"\"\n", + " Takes a CIGAR string and convert it into a list of base level alignment.\n", + " For example : \"345=\" -> [\"=\", \"=\", ..., \"=\"] of length 345.\n", + " \"\"\"\n", + " _ = re.findall(r'\\d+\\D', cigar)\n", + " final_cigar = []\n", + " for match in _:\n", + " final_cigar += [match[-1]]*int(match[:-1])\n", + "\n", + " return final_cigar\n", + "\n", + "def basealn2cigar(base_aln_list):\n", + " \n", + " last_elem = base_aln_list[0]\n", + " CIGAR = [[1, last_elem]]\n", + " for elem in base_aln_list[1:]:\n", + " if elem == last_elem:\n", + " CIGAR[-1][0] += 1\n", + "\n", + " else :\n", + " CIGAR[-1][0] = str(CIGAR[-1][0])\n", + " CIGAR.append([1, elem])\n", + " last_elem = elem\n", + " CIGAR[-1][0] = str(CIGAR[-1][0])\n", + " return \"\".join([\"\".join(block) for block in CIGAR if block[1] != \"\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be12e9d4-de76-4c8b-af84-6567549483f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::GAF Parser] Reading /home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf ...\n", + "[gaf2aln::GAF Parser] Extracting alignments ...\n", + "{'ALN_1': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7046526', '+'), ('7046528', '+'), ('7046530', '+'), ('7046531', '+'), ('7046532', '+'), ('7046533', '+'), ('7046534', '+'), ('7046536', '+'), ('7046537', '+'), ('7046539', '+'), ('7046541', '+'), ('7046542', '+'), ('7046544', '+'), ('7046546', '+'), 
('7046547', '+'), ('7046549', '+'), ('7046551', '+'), ('7046552', '+'), ('7046554', '+'), ('7046556', '+'), ('7046556', '+'), ('7046556', '+'), ('7046557', '+'), ('7046558', '+'), ('7046559', '+'), ('7046560', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046562', '+'), ('7046564', '+'), ('7046565', '+'), ('7046567', '+'), ('7046568', '+'), ('7046570', '+'), ('7046571', '+'), ('7046573', '+'), ('7046574', '+'), ('7046576', '+'), ('7046577', '+'), ('7046579', '+'), ('7046581', '+'), ('7046583', '+'), ('7046584', '+'), ('7046586', '+'), ('7046587', '+'), ('7046589', '+'), ('7046590', '+'), ('7046592', '+'), ('7046593', '+'), ('7046594', '+'), ('7046596', '+'), ('7046597', '+'), ('7046599', '+'), ('7046600', '+'), ('7046601', '+'), ('7046603', '+'), ('7046604', '+'), ('7046606', '+'), ('7046608', '+'), ('7046609', '+'), ('7046621', '+'), ('7046622', '+'), ('7046624', '+'), ('7046625', '+'), ('7046626', '+'), ('7046628', '+'), ('7046631', '+'), ('7046673', '+'), ('7046631', '+'), ('7046673', '+'), ('7046631', '+'), ('7046632', '+'), ('7046634', '+'), ('7046635', '+'), ('7046637', '+'), ('7046638', '+'), ('7046639', '+'), ('7046641', '+'), ('7046644', '+'), ('7046646', '+'), ('7046647', '+'), ('7046649', '+'), ('7046650', '+'), ('7046652', '+'), ('7046653', '+'), ('7046654', '+'), ('7046656', '+'), ('7046657', '+'), ('7046659', '+'), ('7046660', '+'), ('7046662', '+'), ('7046663', '+'), ('7046665', '+'), ('7046667', '+'), ('7046668', '+'), ('7046670', '+'), ('7046671', '+'), ('7046674', '+'), ('7046675', '+'), ('7046674', '+'), ('7046675', '+'), ('7046676', '+'), ('7046678', '+'), ('7046679', '+'), ('7046680', '+'), ('7046682', '+'), ('7046684', '+'), ('7046685', '+'), ('7046686', '+'), ('7046688', '+'), ('7046690', '+'), ('7046692', '+'), ('7046693', '+'), ('7046695', '+'), ('7046696', '+'), ('7046698', '+'), ('7046700', '+'), 
('7046702', '+'), ('7046703', '+'), ('7046704', '+'), ('7046706', '+'), ('7046707', '+'), ('7046709', '+'), ('7046710', '+'), ('7046712', '+'), ('7046713', '+'), ('7046715', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046720', '+'), ('7046722', '+'), ('7046724', '+'), ('7046725', '+'), ('7046727', '+'), ('7046728', '+'), ('7046729', '+'), ('7046730', '+'), ('7046731', '+'), ('7046733', '+'), ('7046735', '+'), ('7046736', '+'), ('7046738', '+'), ('7046739', '+'), ('7046740', '+'), ('7046738', '+'), ('7046739', '+'), ('7046740', '+'), ('7046738', '+'), ('7046739', '+'), ('7046741', '+')], 'PATH.LEN': '3822', 'ALN.START': '77', 'ALN.END': '3812', 'RES.MATCH': '3735', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '60', 'RAW.CIGAR': 'cg:Z:3735=', 'TAGS': 'AS:f:3735,dv:f:0,id:f:1'}, 'ALN_2': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7594382', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), ('7594626', '+'), ('7594011', '+'), ('7594374', '+'), ('7594375', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594241', '+'), ('7594248', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594311', '+'), ('7594330', '+'), ('7594311', '+'), ('7594315', '+'), ('7594374', '+'), ('7594311', '+'), ('7594374', '+'), ('7594369', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594021', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594286', '+'), ('7594311', '+'), ('7594374', '+'), 
('7594021', '+'), ('7594286', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594350', '+'), ('7594264', '+'), ('7594207', '+'), ('7594225', '+'), ('7594227', '+'), ('7594120', '+'), ('7594132', '+'), ('7594165', '+'), ('7594172', '+')], 'PATH.LEN': '61224', 'ALN.START': '0', 'ALN.END': '3735', 'RES.MATCH': '3734', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '0', 'RAW.CIGAR': 'cg:Z:57=1X3677=', 'TAGS': 'AS:f:3732.06,dv:f:0.000267738,id:f:0.999732'}}\n" + ] + } + ], + "source": [ + "# Parsing the .gaf file\n", + "print(f\"[gaf2aln::GAF Parser] Reading {args.gaf} ...\")\n", + "with open(args.gaf, 'r') as file:\n", + " gaf_lines = file.readlines()\n", + "\n", + "gaf_col = [\n", + " \"QRY.NAME\", \"QRY.LEN\", \"QRY.START\", \"QRY.END\", \"STRAND\", \n", + " \"PATH.MATCH\", \"PATH.LEN\", \"ALN.START\", \"ALN.END\",\n", + " \"RES.MATCH\", \"ALN.BLOCK.LEN\", \"MAPPING.QUAL\"\n", + " ]\n", + "\n", + "# Creating dictionnary to store alignments\n", + "print(f\"[gaf2aln::GAF Parser] Extracting alignments ...\")\n", + "aln_dict = {}\n", + "for line in range(len(gaf_lines)):\n", + " ## Splitting the line by tabulation\n", + " line_content = gaf_lines[line][:-1].split('\\t')\n", + "\n", + " ## Adding alignement info to dictionnary\n", + " aln_dict[f\"ALN_{line+1}\"] = {\n", + " gaf_col[i]: line_content[i] for i in range(len(gaf_col))\n", + " }\n", + " \n", + " ## Splitting \"PATH.MATCH\" into a list\n", + " aln_dict[f\"ALN_{line+1}\"][\"PATH.MATCH\"] = [\n", + " (str(node_id[:-1]), node_id[-1]) \n", + " for node_id in walk2path(aln_dict[f\"ALN_{line+1}\"][\"PATH.MATCH\"])\n", + " ]\n", + "\n", + " ## Adding CIGAR\n", + " aln_dict[f\"ALN_{line+1}\"][\"RAW.CIGAR\"] = line_content[-1]\n", + "\n", + " ## Adding tags\n", + " aln_dict[f\"ALN_{line+1}\"][\"TAGS\"] = \",\".join(line_content[13:-1])\n", + "\n", + "# Getting nodes of 
interest ids\n", + "aln_nodes = np.unique([\n", + " str(node_id) \n", + " for aln in aln_dict.keys() \n", + " for node_id, orient in aln_dict[aln][\"PATH.MATCH\"]\n", + "]).tolist()\n", + "\n", + "print(aln_dict)\n", + "del gaf_lines, gaf_col" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f891424-0d88-4fd3-99ff-b0a8c90587ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::GFA Parser] Reading /home/amergez/Documents/Scratch/LeChou/35Bra-v2a/35Bra-v2a.chr03.gfa ...\n", + "[gaf2aln::GFA Parser] Extracting nodes, paths and links ...\n" + ] + } + ], + "source": [ + "# Parsing the .gfa\n", + "print(f\"[gaf2aln::GFA Parser] Reading {args.gfa} ...\")\n", + "with open(args.gfa, 'r') as file:\n", + " gfa_lines = file.readlines()\n", + "\n", + "# Nodes length dictionnary structured as follow :\n", + "# {<NODE.ID>: <NODE.LENGTH>}\n", + "nodes_length = {}\n", + "# Nodes dictionnary structured as follow :\n", + "# { <ALN.NODE.ID> : {\n", + "# <PATH.NAME>: {\"START\": start, \"END\": end, \"STRAND\": strand), \n", + "# <ALN.NAME>: {\"START\": start, \"END\": end, \"S.OFF\": start.offset, \"E.OFF\": end.offset, \"STRAND\": strand, \"CIGAR\": CIGAR}\n", + "# }\n", + "# }\n", + "nodes = {node_id: {} for node_id in aln_nodes}\n", + "# Paths dictionnary structured as follow :\n", + "# {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>}\n", + "paths = {}\n", + "# Links dictionnary structured as follow : \n", + "# {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}}\n", + "links = {}\n", + "\n", + "# Parsing the gfa\n", + "print(f\"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...\")\n", + "\n", + "def GFA_parser(gfa_lines, nodes = nodes):\n", + " _links, _nodes, _nodes_length, paths = {}, {}, {}, {}\n", + " for line in gfa_lines:\n", + " line_content = line[:-1].split(\"\\t\")\n", + " line_id = line_content[0]\n", + " 
\n", + " # Segment line\n", + " if line_id == \"S\" :\n", + " \n", + " _nodes_length[str(line_content[1])] = len(line_content[2])\n", + " \n", + " # Link line\n", + " elif line_id == \"L\":\n", + " try :\n", + " _links[str(line_content[1])][str(line_content[3])] = {\n", + " \"FROM\": str(line_content[2]), \n", + " \"TO\": str(line_content[4])\n", + " }\n", + "\n", + " except :\n", + " _links[str(line_content[1])] = {\n", + " str(line_content[3]) : {\"FROM.ORIENT\": str(line_content[2]), \"TO.ORIENT\": str(line_content[4])}\n", + " }\n", + "\n", + " # Path line\n", + " elif line_id == \"P\":\n", + " _paths[str(line_content[1])] = {\n", + " \"NODES\": {\n", + " str(node_id[:-1]): str(node_id[-1])\n", + " for node_id in line_content[2].split(',')\n", + " },\n", + " \"CIGAR\": line_content[3]\n", + " }\n", + "\n", + " return [_links, _nodes, _nodes_length, _paths]\n", + "\n", + "# splits = np.quantile(range(len(gfa_lines)+1), q= np.array(args.threads+1)/args.threads, method='higher').tolist()\n", + "# res = []\n", + "# for i in range(1, len(splits)):\n", + "# res.append(executor.submit(GFA_parser, gfa_lines[splits[i-1]:splits[i]]))\n", + "\n", + "# for out in res:\n", + "# results = out.result()\n", + "\n", + "# for link_id, link_info in results[0].items():\n", + "# links[]\n", + "\n", + "\n", + "for line in gfa_lines:\n", + " line_content = line[:-1].split(\"\\t\")\n", + " line_id = line_content[0]\n", + " \n", + " # Segment line\n", + " if line_id == \"S\" :\n", + " \n", + " nodes_length[str(line_content[1])] = len(line_content[2])\n", + " \n", + " # Link line\n", + " elif line_id == \"L\":\n", + " try :\n", + " links[str(line_content[1])][str(line_content[3])] = {\n", + " \"FROM\": str(line_content[2]), \n", + " \"TO\": str(line_content[4])\n", + " }\n", + "\n", + " except :\n", + " links[str(line_content[1])] = {\n", + " str(line_content[3]) : {\"FROM.ORIENT\": str(line_content[2]), \"TO.ORIENT\": str(line_content[4])}\n", + " }\n", + "\n", + " # Path line\n", + " 
elif line_id == \"P\":\n", + " paths[str(line_content[1])] = {\n", + " \"NODES\": {\n", + " str(node_id[:-1]): str(node_id[-1])\n", + " for node_id in line_content[2].split(',')\n", + " },\n", + " \"CIGAR\": line_content[3]\n", + " }\n", + "\n", + "del gfa_lines" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a403c88e-54ea-4a67-9047-dc44eba7f51a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::Graph position processing] Computing nodes positions in each paths...\n", + "[gaf2aln::Graph position processing] Running on Capitata#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on D101#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on D134#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on G06-09-28#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on G07-DH-33#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on HDEM#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on Korso#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on M249#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on OX-heart#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on PL021#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on RC34#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T02#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T03#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T04#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T06#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T07#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T08#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T09#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T10#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T11#1#chr03 
...\n", + "[gaf2aln::Graph position processing] Running on T12#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T13#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T14#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T15#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T16#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T17#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T18#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T19#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T21#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T24#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T25#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T26#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on T27#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on TO1000#1#chr03 ...\n", + "[gaf2aln::Graph position processing] Running on W1701#1#chr03 ...\n" + ] + } + ], + "source": [ + "print(f\"[gaf2aln::Graph position processing] Computing nodes positions in each paths...\")\n", + "def get_node_pos(path_name, nodes = nodes, paths = paths, nodes_length = nodes_length):\n", + " print(f\"[gaf2aln::Graph position processing] Running on {path_name} ...\")\n", + " cur_pos = 0\n", + "\n", + " out = {}\n", + " # Iterating over nodes in the path\n", + " for path_node in paths[path_name][\"NODES\"].keys():\n", + " # Instead of checking if the node is one interesting node, we try to add to the nodes dict\n", + " if path_node in aln_nodes :\n", + " out[path_node] = {\n", + " \"START\": cur_pos, # Start position of the node start in the currrent path\n", + " \"END\": cur_pos+nodes_length[path_node], # End position of the node end in the current path\n", + " \"STRAND\": paths[path_name][\"NODES\"][path_node] # Orientation of the node in the current path\n", + " } \n", + 
"\n", + " cur_pos += nodes_length[path_node]+1\n", + " else :\n", + " cur_pos += nodes_length[path_node]+1\n", + "\n", + " return out\n", + "\n", + "res = {}\n", + "executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads)\n", + "# Adding nodes positions relative to path\n", + "for path_name in paths.keys():\n", + " res[path_name] = executor.submit(get_node_pos, path_name)\n", + "\n", + "executor.shutdown(wait=True)\n", + "\n", + "for path_name, out in res.items():\n", + " results = out.result()\n", + " for path_node, node_pos in results.items():\n", + " nodes[path_node][path_name] = node_pos\n", + "\n", + "del res" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bed36bd5-30eb-4d02-8b52-1ae5d753f8f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...\n", + "[gaf2aln::Alignment position processing] Running on ALN_1 ...\n", + "0 77 77 0 + 154 77\n", + "77 82 0 0 + 5 82\n", + "82 83 0 0 + 1 83\n", + "83 138 0 0 + 55 138\n", + "138 139 0 0 + 1 139\n", + "139 202 0 0 + 63 202\n", + "202 203 0 0 + 1 203\n", + "203 379 0 0 + 176 379\n", + "379 380 0 0 + 1 380\n", + "380 429 0 0 + 49 429\n", + "429 430 0 0 + 1 430\n", + "430 457 0 0 + 27 457\n", + "457 492 0 0 + 35 492\n", + "492 494 0 0 + 2 494\n", + "494 497 0 0 + 3 497\n", + "497 507 0 0 + 10 507\n", + "507 508 0 0 + 1 508\n", + "508 564 0 0 + 56 564\n", + "564 566 0 0 + 2 566\n", + "566 567 0 0 + 1 567\n", + "567 568 0 0 + 1 568\n", + "568 569 0 0 + 1 569\n", + "569 824 0 0 + 255 824\n", + "824 826 0 0 + 2 826\n", + "826 858 0 0 + 32 858\n", + "858 859 0 0 + 1 859\n", + "859 860 0 0 + 1 860\n", + "860 861 0 0 + 1 861\n", + "861 862 0 0 + 1 862\n", + "862 863 0 0 + 1 863\n", + "863 864 0 0 + 1 864\n", + "864 865 0 0 + 1 865\n", + "865 866 0 0 + 1 866\n", + "866 867 0 0 + 1 867\n", + "867 868 0 0 + 1 868\n", + "868 869 0 0 + 1 869\n", + "869 913 0 
0 + 44 913\n", + "913 919 0 0 + 6 919\n", + "919 978 0 0 + 59 978\n", + "978 979 0 0 + 1 979\n", + "979 1038 0 0 + 59 1038\n", + "1038 1045 0 0 + 7 1045\n", + "1045 1046 0 0 + 1 1046\n", + "1046 1080 0 0 + 34 1080\n", + "1080 1081 0 0 + 1 1081\n", + "1081 1107 0 0 + 26 1107\n", + "1107 1108 0 0 + 1 1108\n", + "1108 1183 0 0 + 75 1183\n", + "1183 1186 0 0 + 3 1186\n", + "1186 1224 0 0 + 38 1224\n", + "1224 1257 0 0 + 33 1257\n", + "1257 1289 0 0 + 32 1289\n", + "1289 1311 0 0 + 22 1311\n", + "1311 1359 0 0 + 48 1359\n", + "1359 1382 0 0 + 23 1382\n", + "1382 1434 0 0 + 52 1434\n", + "1434 1451 0 0 + 17 1451\n", + "1451 1531 0 0 + 80 1531\n", + "1531 1532 0 0 + 1 1532\n", + "1532 1543 0 0 + 11 1543\n", + "1543 1544 0 0 + 1 1544\n", + "1544 1572 0 0 + 28 1572\n", + "1572 1573 0 0 + 1 1573\n", + "1573 1587 0 0 + 14 1587\n", + "1587 1588 0 0 + 1 1588\n", + "1588 1616 0 0 + 28 1616\n", + "1616 1617 0 0 + 1 1617\n", + "1617 1646 0 0 + 29 1646\n", + "1646 1661 0 0 + 15 1661\n", + "1661 1673 0 0 + 12 1673\n", + "1673 1674 0 0 + 1 1674\n", + "1674 1726 0 0 + 52 1726\n", + "1726 1727 0 0 + 1 1727\n", + "1727 1762 0 0 + 35 1762\n", + "1762 1763 0 0 + 1 1763\n", + "1763 1764 0 0 + 1 1764\n", + "1764 1765 0 0 + 1 1765\n", + "1765 1766 0 0 + 1 1766\n", + "1766 1767 0 0 + 1 1767\n", + "1767 1824 0 0 + 57 1824\n", + "1824 1825 0 0 + 1 1825\n", + "1825 1975 0 0 + 150 1975\n", + "1975 1976 0 0 + 1 1976\n", + "1976 2015 0 0 + 39 2015\n", + "2015 2016 0 0 + 1 2016\n", + "2016 2047 0 0 + 31 2047\n", + "2047 2055 0 0 + 8 2055\n", + "2055 2056 0 0 + 1 2056\n", + "2056 2120 0 0 + 64 2120\n", + "2120 2121 0 0 + 1 2121\n", + "2121 2157 0 0 + 36 2157\n", + "2157 2158 0 0 + 1 2158\n", + "2158 2170 0 0 + 12 2170\n", + "2170 2171 0 0 + 1 2171\n", + "2171 2205 0 0 + 34 2205\n", + "2205 2206 0 0 + 1 2206\n", + "2206 2344 0 0 + 138 2344\n", + "2344 2345 0 0 + 1 2345\n", + "2345 2364 0 0 + 19 2364\n", + "2364 2383 0 0 + 19 2383\n", + "2383 2408 0 0 + 25 2408\n", + "2408 2409 0 0 + 1 2409\n", + "2409 
2441 0 0 + 32 2441\n", + "2441 2442 0 0 + 1 2442\n", + "2442 2580 0 0 + 138 2580\n", + "2580 2581 0 0 + 1 2581\n", + "2581 2582 0 0 + 1 2582\n", + "2582 2583 0 0 + 1 2583\n", + "2583 2584 0 0 + 1 2584\n", + "2584 2764 0 0 + 180 2764\n", + "2764 2765 0 0 + 1 2765\n", + "2765 2797 0 0 + 32 2797\n", + "2797 2798 0 0 + 1 2798\n", + "2798 2878 0 0 + 80 2878\n", + "2878 2879 0 0 + 1 2879\n", + "2879 2951 0 0 + 72 2951\n", + "2951 2952 0 0 + 1 2952\n", + "2952 3002 0 0 + 50 3002\n", + "3002 3077 0 0 + 75 3077\n", + "3077 3078 0 0 + 1 3078\n", + "3078 3093 0 0 + 15 3093\n", + "3093 3094 0 0 + 1 3094\n", + "3094 3097 0 0 + 3 3097\n", + "3097 3140 0 0 + 43 3140\n", + "3140 3210 0 0 + 70 3210\n", + "3210 3211 0 0 + 1 3211\n", + "3211 3229 0 0 + 18 3229\n", + "3229 3230 0 0 + 1 3230\n", + "3230 3276 0 0 + 46 3276\n", + "3276 3277 0 0 + 1 3277\n", + "3277 3315 0 0 + 38 3315\n", + "3315 3316 0 0 + 1 3316\n", + "3316 3322 0 0 + 6 3322\n", + "3322 3323 0 0 + 1 3323\n", + "3323 3348 0 0 + 25 3348\n", + "3348 3349 0 0 + 1 3349\n", + "3349 3350 0 0 + 1 3350\n", + "3350 3351 0 0 + 1 3351\n", + "3351 3352 0 0 + 1 3352\n", + "3352 3353 0 0 + 1 3353\n", + "3353 3354 0 0 + 1 3354\n", + "3354 3356 0 0 + 2 3356\n", + "3356 3357 0 0 + 1 3357\n", + "3357 3489 0 0 + 132 3489\n", + "3489 3490 0 0 + 1 3490\n", + "3490 3642 0 0 + 152 3642\n", + "3642 3644 0 0 + 2 3644\n", + "3644 3685 0 0 + 41 3685\n", + "3685 3687 0 0 + 2 3687\n", + "3687 3693 0 0 + 6 3693\n", + "3693 3694 0 0 + 1 3694\n", + "3694 3708 0 0 + 14 3708\n", + "3708 3709 0 0 + 1 3709\n", + "3709 3710 0 0 + 1 3710\n", + "3710 3714 0 0 + 4 3714\n", + "3714 3715 0 0 + 1 3715\n", + "3715 3716 0 0 + 1 3716\n", + "3716 3720 0 0 + 4 3720\n", + "3720 3721 0 0 + 1 3721\n", + "3721 3722 0 0 + 1 3722\n", + "3722 3735 0 10 + 23 3735\n", + "[gaf2aln::Alignment position processing] Running on ALN_2 ...\n", + "0 1 0 0 + 1 1\n", + "1 2 0 0 + 1 2\n", + "2 3 0 0 + 1 3\n", + "3 4 0 0 + 1 4\n", + "4 5 0 0 + 1 5\n", + "5 6 0 0 + 1 6\n", + "6 7 0 0 + 1 
7\n", + "7 8 0 0 + 1 8\n", + "8 9 0 0 + 1 9\n", + "9 10 0 0 + 1 10\n", + "10 11 0 0 + 1 11\n", + "11 12 0 0 + 1 12\n", + "12 13 0 0 + 1 13\n", + "13 14 0 0 + 1 14\n", + "14 15 0 0 + 1 15\n", + "15 16 0 0 + 1 16\n", + "16 17 0 0 + 1 17\n", + "17 18 0 0 + 1 18\n", + "18 19 0 0 + 1 19\n", + "19 20 0 0 + 1 20\n", + "20 21 0 0 + 1 21\n", + "21 22 0 0 + 1 22\n", + "22 23 0 0 + 1 23\n", + "23 24 0 0 + 1 24\n", + "24 25 0 0 + 1 25\n", + "25 26 0 0 + 1 26\n", + "26 27 0 0 + 1 27\n", + "27 28 0 0 + 1 28\n", + "28 29 0 0 + 1 29\n", + "29 30 0 0 + 1 30\n", + "30 31 0 0 + 1 31\n", + "31 32 0 0 + 1 32\n", + "32 33 0 0 + 1 33\n", + "33 34 0 0 + 1 34\n", + "34 35 0 0 + 1 35\n", + "35 36 0 0 + 1 36\n", + "36 37 0 0 + 1 37\n", + "37 38 0 0 + 1 38\n", + "38 39 0 0 + 1 39\n", + "39 40 0 0 + 1 40\n", + "40 41 0 0 + 1 41\n", + "41 42 0 0 + 1 42\n", + "42 43 0 0 + 1 43\n", + "43 44 0 0 + 1 44\n", + "44 45 0 0 + 1 45\n", + "45 46 0 0 + 1 46\n", + "46 47 0 0 + 1 47\n", + "47 48 0 0 + 1 48\n", + "48 49 0 0 + 1 49\n", + "49 50 0 0 + 1 50\n", + "50 51 0 0 + 1 51\n", + "51 52 0 0 + 1 52\n", + "52 53 0 0 + 1 53\n", + "53 54 0 0 + 1 54\n", + "54 55 0 0 + 1 55\n", + "55 56 0 0 + 1 56\n", + "56 57 0 0 + 1 57\n", + "57 58 0 0 + 1 58\n", + "58 59 0 0 + 1 59\n", + "59 60 0 0 + 1 60\n", + "60 61 0 0 + 1 61\n", + "61 62 0 0 + 1 62\n", + "62 63 0 0 + 1 63\n", + "63 64 0 0 + 1 64\n", + "64 65 0 0 + 1 65\n", + "65 66 0 0 + 1 66\n", + "66 67 0 0 + 1 67\n", + "67 68 0 0 + 1 68\n", + "68 69 0 0 + 1 69\n", + "69 70 0 0 + 1 70\n", + "70 71 0 0 + 1 71\n", + "71 72 0 0 + 1 72\n", + "72 73 0 0 + 1 73\n", + "73 74 0 0 + 1 74\n", + "74 75 0 0 + 1 75\n", + "75 76 0 0 + 1 76\n", + "76 77 0 0 + 1 77\n", + "77 78 0 0 + 1 78\n", + "78 3735 0 57489 + 61146 3735\n" + ] + } + ], + "source": [ + "print(f\"[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...\")\n", + "# Adding nodes positions relative to path\n", + "\n", + "def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length 
= nodes_length):\n", + " # Initializing current position in query\n", + " cur_pos = 0\n", + "\n", + " # Getting start and end node ids\n", + " start_end_id = (aln_dict[aln_name][\"PATH.MATCH\"][0][0], aln_dict[aln_name][\"PATH.MATCH\"][-1][0])\n", + "\n", + " # Creating result dictionnary\n", + " res = {}\n", + "\n", + " ## Iterating over node_ids from the given alignment\n", + " for node_id, orient in aln_dict[aln_name][\"PATH.MATCH\"]:\n", + " # Adding entry for current node\n", + " res[node_id] = {aln_name: {}}\n", + "\n", + " # First node\n", + " if node_id == start_end_id[0]:\n", + " start_pos = 0\n", + " s_off = int(aln_dict[aln_name][\"ALN.START\"])\n", + " end_pos = nodes_length[node_id]-s_off\n", + " e_off = 0\n", + " # End node\n", + " elif node_id == start_end_id[1]:\n", + " start_pos = cur_pos\n", + " s_off = 0\n", + " end_pos = int(aln_dict[aln_name][\"QRY.END\"])\n", + " e_off = nodes_length[node_id]-(end_pos-cur_pos)\n", + " # Node in between\n", + " else :\n", + " start_pos = cur_pos\n", + " s_off, e_off = 0, 0\n", + " end_pos = cur_pos+nodes_length[node_id]\n", + "\n", + " res[node_id] = {\n", + " \"START\": start_pos, # Start position on the query\n", + " \"END\": end_pos, # End position on the query\n", + " \"S.OFF\": s_off, # Offset between the start of the alignment and the node's start\n", + " \"E.OFF\": e_off, # Offset between the end of the alignment and the node's end \n", + " \"STRAND\": orient # Orientation of the node in the alignment\n", + " }\n", + " \n", + " cur_pos = end_pos\n", + " print(start_pos, end_pos, s_off, e_off, orient, nodes_length[node_id], cur_pos)\n", + "\n", + " return res\n", + "\n", + "# Storing alignement \n", + "res = {}\n", + "executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads)\n", + "for aln_name in aln_dict.keys():\n", + " print(f\"[gaf2aln::Alignment position processing] Running on {aln_name} ...\")\n", + " \n", + " res[aln_name] = executor.submit(get_aln_node_info, aln_name)\n", + " 
#res[aln_name] = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length)\n", + "\n", + "executor.shutdown(wait=True)\n", + "\n", + "for aln_name, node_info in res.items():\n", + " results = node_info.result()\n", + " for node_id, info in results.items():\n", + " nodes[node_id][aln_name] = info\n", + "\n", + "del res" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4c30727c-7ffc-4852-ad81-ca2a5a7f9957", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...\n", + "[gaf2aln::CIGAR processing] Running on ALN_1 ...\n", + "[gaf2aln::CIGAR processing] Running on ALN_2 ...\n" + ] + } + ], + "source": [ + "# Calculating CIGAR for each nodes in each aln\n", + "print(f\"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...\")\n", + "# Iterating over alignments\n", + "for aln in aln_dict.keys():\n", + " \n", + " print(f\"[gaf2aln::CIGAR processing] Running on {aln} ...\")\n", + " # Getting the list of base level alignement ([\"=\", \"X\", ...] 
from \"1=1X...\")\n", + " raw_cigar = cigar2basealn(aln_dict[aln][\"RAW.CIGAR\"])\n", + " CIGAR={}\n", + "\n", + " for node_id, orient in aln_dict[aln][\"PATH.MATCH\"]:\n", + "\n", + " _cigar = basealn2cigar(raw_cigar[\n", + " nodes[node_id][aln][\"START\"]:nodes[node_id][aln][\"END\"]\n", + " ])\n", + " nodes[node_id][aln][\"CIGAR\"] = _cigar\n", + " #print(_cigar, nodes[node_id][aln][\"START\"], nodes[node_id][aln][\"END\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e15e4762-cd71-4afe-bc74-ebe44869fee6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALN_1\n", + "7046526 D134#1#chr03 0 77\n", + "\tIn path\n", + "\t 73306158 73306235\n", + "skipped\n", + "\n", + "7046528 D134#1#chr03 77 82\n", + "\tIn path\n", + "\t 73306238 73306243\n", + "{'Q.START': 0, 'Q.END': 77, 'T.START': 73306158, 'T.END': 73306235, 'CG': '77='}\n", + "7046530 D134#1#chr03 82 83\n", + "\tNot in path\n", + "7046531 D134#1#chr03 83 138\n", + "\tIn path\n", + "\t 73306246 73306301\n", + "{'Q.START': 77, 'Q.END': 82, 'T.START': 73306238, 'T.END': 73306243, 'CG': '5='}\n", + "7046532 D134#1#chr03 138 139\n", + "\tNot in path\n", + "7046533 D134#1#chr03 139 202\n", + "\tIn path\n", + "\t 73306302 73306365\n", + "{'Q.START': 83, 'Q.END': 138, 'T.START': 73306246, 'T.END': 73306301, 'CG': '55='}\n", + "7046534 D134#1#chr03 202 203\n", + "\tIn path\n", + "\t 73306366 73306367\n", + "{'Q.START': 139, 'Q.END': 202, 'T.START': 73306302, 'T.END': 73306365, 'CG': '63='}\n", + "7046536 D134#1#chr03 203 379\n", + "\tIn path\n", + "\t 73306368 73306544\n", + "{'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}\n", + "7046537 D134#1#chr03 379 380\n", + "\tIn path\n", + "\t 73306545 73306546\n", + "{'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}\n", + "7046539 D134#1#chr03 380 429\n", + "\tIn path\n", + "\t 73306547 73306596\n", + "{'Q.START': 379, 'Q.END': 380, 
'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}\n", + "7046541 D134#1#chr03 429 430\n", + "\tIn path\n", + "\t 73306597 73306598\n", + "{'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}\n", + "7046542 D134#1#chr03 430 457\n", + "\tIn path\n", + "\t 73306599 73306626\n", + "{'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}\n", + "7046544 D134#1#chr03 457 492\n", + "\tIn path\n", + "\t 73306641 73306676\n", + "{'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}\n", + "7046546 D134#1#chr03 492 494\n", + "\tNot in path\n", + "7046547 D134#1#chr03 494 497\n", + "\tNot in path\n", + "7046549 D134#1#chr03 497 507\n", + "\tNot in path\n", + "7046551 D134#1#chr03 507 508\n", + "\tNot in path\n", + "7046552 D134#1#chr03 508 564\n", + "\tIn path\n", + "\t 73306694 73306750\n", + "{'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}\n", + "7046554 D134#1#chr03 564 566\n", + "\tNot in path\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046556 D134#1#chr03 568 569\n", + "\tIn path\n", + "\t 73306753 73306754\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046557 D134#1#chr03 569 824\n", + "\tIn path\n", + "\t 73306755 73307010\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", + "7046558 D134#1#chr03 824 826\n", + "\tNot in path\n", + "7046559 D134#1#chr03 826 858\n", + "\tIn path\n", + "\t 73307011 73307043\n", + "{'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}\n", + "7046560 D134#1#chr03 858 859\n", + "\tIn path\n", + 
"\t 73307044 73307045\n", + "{'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046561 D134#1#chr03 868 869\n", + "\tIn path\n", + "\t 73307046 73307047\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + "7046562 D134#1#chr03 869 913\n", + "\tIn path\n", + "\t 73307048 73307092\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", + 
"7046564 D134#1#chr03 913 919\n", + "\tIn path\n", + "\t 73307093 73307099\n", + "{'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}\n", + "7046565 D134#1#chr03 919 978\n", + "\tIn path\n", + "\t 73307100 73307159\n", + "{'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}\n", + "7046567 D134#1#chr03 978 979\n", + "\tIn path\n", + "\t 73307160 73307161\n", + "{'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}\n", + "7046568 D134#1#chr03 979 1038\n", + "\tIn path\n", + "\t 73307162 73307221\n", + "{'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}\n", + "7046570 D134#1#chr03 1038 1045\n", + "\tIn path\n", + "\t 73307224 73307231\n", + "{'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}\n", + "7046571 D134#1#chr03 1045 1046\n", + "\tIn path\n", + "\t 73307232 73307233\n", + "{'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}\n", + "7046573 D134#1#chr03 1046 1080\n", + "\tIn path\n", + "\t 73307234 73307268\n", + "{'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}\n", + "7046574 D134#1#chr03 1080 1081\n", + "\tIn path\n", + "\t 73307269 73307270\n", + "{'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 'CG': '34='}\n", + "7046576 D134#1#chr03 1081 1107\n", + "\tIn path\n", + "\t 73307271 73307297\n", + "{'Q.START': 1080, 'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}\n", + "7046577 D134#1#chr03 1107 1108\n", + "\tNot in path\n", + "7046579 D134#1#chr03 1108 1183\n", + "\tIn path\n", + "\t 73307300 73307375\n", + "{'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}\n", + "7046581 D134#1#chr03 1183 1186\n", + "\tIn path\n", + "\t 73307376 73307379\n", + "{'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}\n", + "7046583 D134#1#chr03 
1186 1224\n", + "\tNot in path\n", + "7046584 D134#1#chr03 1224 1257\n", + "\tIn path\n", + "\t 73307419 73307452\n", + "{'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}\n", + "7046586 D134#1#chr03 1257 1289\n", + "\tNot in path\n", + "7046587 D134#1#chr03 1289 1311\n", + "\tIn path\n", + "\t 73307475 73307497\n", + "{'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}\n", + "7046589 D134#1#chr03 1311 1359\n", + "\tNot in path\n", + "7046590 D134#1#chr03 1359 1382\n", + "\tIn path\n", + "\t 73307546 73307569\n", + "{'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}\n", + "7046592 D134#1#chr03 1382 1434\n", + "\tNot in path\n", + "7046593 D134#1#chr03 1434 1451\n", + "\tIn path\n", + "\t 73307643 73307660\n", + "{'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}\n", + "7046594 D134#1#chr03 1451 1531\n", + "\tIn path\n", + "\t 73307661 73307741\n", + "{'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}\n", + "7046596 D134#1#chr03 1531 1532\n", + "\tNot in path\n", + "7046597 D134#1#chr03 1532 1543\n", + "\tIn path\n", + "\t 73307744 73307755\n", + "{'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}\n", + "7046599 D134#1#chr03 1543 1544\n", + "\tNot in path\n", + "7046600 D134#1#chr03 1544 1572\n", + "\tIn path\n", + "\t 73307758 73307786\n", + "{'Q.START': 1532, 'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}\n", + "7046601 D134#1#chr03 1572 1573\n", + "\tIn path\n", + "\t 73307787 73307788\n", + "{'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}\n", + "7046603 D134#1#chr03 1573 1587\n", + "\tIn path\n", + "\t 73307789 73307803\n", + "{'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}\n", + "7046604 D134#1#chr03 1587 1588\n", + "\tNot in path\n", + "7046606 D134#1#chr03 1588 
1616\n", + "\tIn path\n", + "\t 73307806 73307834\n", + "{'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}\n", + "7046608 D134#1#chr03 1616 1617\n", + "\tIn path\n", + "\t 73307835 73307836\n", + "{'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}\n", + "7046609 D134#1#chr03 1617 1646\n", + "\tIn path\n", + "\t 73307837 73307866\n", + "{'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 'T.END': 73307836, 'CG': '1='}\n", + "7046621 D134#1#chr03 1646 1661\n", + "\tIn path\n", + "\t 73307867 73307882\n", + "{'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}\n", + "7046622 D134#1#chr03 1661 1673\n", + "\tIn path\n", + "\t 73307883 73307895\n", + "{'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}\n", + "7046624 D134#1#chr03 1673 1674\n", + "\tIn path\n", + "\t 73307896 73307897\n", + "{'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}\n", + "7046625 D134#1#chr03 1674 1726\n", + "\tIn path\n", + "\t 73307898 73307950\n", + "{'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}\n", + "7046626 D134#1#chr03 1726 1727\n", + "\tNot in path\n", + "7046628 D134#1#chr03 1727 1762\n", + "\tIn path\n", + "\t 73307953 73307988\n", + "{'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}\n", + "7046673 D134#1#chr03 1765 1766\n", + "\tIn path\n", + "\t 73307993 73307994\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", + "7046673 D134#1#chr03 1765 1766\n", 
+ "\tIn path\n", + "\t 73307993 73307994\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046631 D134#1#chr03 1766 1767\n", + "\tIn path\n", + "\t 73307991 73307992\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", + "7046632 D134#1#chr03 1767 1824\n", + "\tIn path\n", + "\t 73307995 73308052\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", + "7046634 D134#1#chr03 1824 1825\n", + "\tIn path\n", + "\t 73308053 73308054\n", + "{'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}\n", + "7046635 D134#1#chr03 1825 1975\n", + "\tIn path\n", + "\t 73308055 73308205\n", + "{'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}\n", + "7046637 D134#1#chr03 1975 1976\n", + "\tNot in path\n", + "7046638 D134#1#chr03 1976 2015\n", + "\tIn path\n", + "\t 73308208 73308247\n", + "{'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}\n", + "7046639 D134#1#chr03 2015 2016\n", + "\tNot in path\n", + "7046641 D134#1#chr03 2016 2047\n", + "\tIn path\n", + "\t 73308250 73308281\n", + "{'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}\n", + "7046644 D134#1#chr03 2047 2055\n", + "\tIn path\n", + "\t 73308286 73308294\n", + "{'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': '31='}\n", + "7046646 D134#1#chr03 2055 2056\n", + "\tNot in path\n", + "7046647 D134#1#chr03 2056 2120\n", + "\tIn path\n", + "\t 73308297 73308361\n", + "{'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}\n", + "7046649 D134#1#chr03 2120 2121\n", + "\tIn path\n", + "\t 73308362 73308363\n", + "{'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}\n", + "7046650 D134#1#chr03 2121 2157\n", + "\tIn path\n", + "\t 73308364 73308400\n", + "{'Q.START': 2120, 
'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}\n", + "7046652 D134#1#chr03 2157 2158\n", + "\tNot in path\n", + "7046653 D134#1#chr03 2158 2170\n", + "\tIn path\n", + "\t 73308403 73308415\n", + "{'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}\n", + "7046654 D134#1#chr03 2170 2171\n", + "\tIn path\n", + "\t 73308416 73308417\n", + "{'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}\n", + "7046656 D134#1#chr03 2171 2205\n", + "\tIn path\n", + "\t 73308418 73308452\n", + "{'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}\n", + "7046657 D134#1#chr03 2205 2206\n", + "\tNot in path\n", + "7046659 D134#1#chr03 2206 2344\n", + "\tIn path\n", + "\t 73308455 73308593\n", + "{'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 73308452, 'CG': '34='}\n", + "7046660 D134#1#chr03 2344 2345\n", + "\tNot in path\n", + "7046662 D134#1#chr03 2345 2364\n", + "\tIn path\n", + "\t 73308596 73308615\n", + "{'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}\n", + "7046663 D134#1#chr03 2364 2383\n", + "\tIn path\n", + "\t 73308616 73308635\n", + "{'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}\n", + "7046665 D134#1#chr03 2383 2408\n", + "\tIn path\n", + "\t 73308636 73308661\n", + "{'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': '19='}\n", + "7046667 D134#1#chr03 2408 2409\n", + "\tIn path\n", + "\t 73308662 73308663\n", + "{'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}\n", + "7046668 D134#1#chr03 2409 2441\n", + "\tIn path\n", + "\t 73308664 73308696\n", + "{'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}\n", + "7046670 D134#1#chr03 2441 2442\n", + "\tIn path\n", + "\t 73308697 73308698\n", + "{'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': 
'32='}\n", + "7046671 D134#1#chr03 2442 2580\n", + "\tIn path\n", + "\t 73308699 73308837\n", + "{'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}\n", + "7046674 D134#1#chr03 2582 2583\n", + "\tIn path\n", + "\t 73308838 73308839\n", + "{'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': '138='}\n", + "7046675 D134#1#chr03 2583 2584\n", + "\tIn path\n", + "\t 73308840 73308841\n", + "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", + "7046674 D134#1#chr03 2582 2583\n", + "\tIn path\n", + "\t 73308838 73308839\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", + "7046675 D134#1#chr03 2583 2584\n", + "\tIn path\n", + "\t 73308840 73308841\n", + "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", + "7046676 D134#1#chr03 2584 2764\n", + "\tIn path\n", + "\t 73308842 73309022\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", + "7046678 D134#1#chr03 2764 2765\n", + "\tNot in path\n", + "7046679 D134#1#chr03 2765 2797\n", + "\tIn path\n", + "\t 73309025 73309057\n", + "{'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}\n", + "7046680 D134#1#chr03 2797 2798\n", + "\tNot in path\n", + "7046682 D134#1#chr03 2798 2878\n", + "\tIn path\n", + "\t 73309060 73309140\n", + "{'Q.START': 2765, 'Q.END': 2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}\n", + "7046684 D134#1#chr03 2878 2879\n", + "\tIn path\n", + "\t 73309141 73309142\n", + "{'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}\n", + "7046685 D134#1#chr03 2879 2951\n", + "\tIn path\n", + "\t 73309143 73309215\n", + "{'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}\n", + "7046686 D134#1#chr03 2951 2952\n", + "\tIn path\n", + "\t 73309216 73309217\n", + "{'Q.START': 2879, 'Q.END': 
2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}\n", + "7046688 D134#1#chr03 2952 3002\n", + "\tIn path\n", + "\t 73309218 73309268\n", + "{'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}\n", + "7046690 D134#1#chr03 3002 3077\n", + "\tIn path\n", + "\t 73309271 73309346\n", + "{'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}\n", + "7046692 D134#1#chr03 3077 3078\n", + "\tIn path\n", + "\t 73309347 73309348\n", + "{'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}\n", + "7046693 D134#1#chr03 3078 3093\n", + "\tIn path\n", + "\t 73309349 73309364\n", + "{'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}\n", + "7046695 D134#1#chr03 3093 3094\n", + "\tNot in path\n", + "7046696 D134#1#chr03 3094 3097\n", + "\tIn path\n", + "\t 73309367 73309370\n", + "{'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}\n", + "7046698 D134#1#chr03 3097 3140\n", + "\tIn path\n", + "\t 73309371 73309414\n", + "{'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 73309370, 'CG': '3='}\n", + "7046700 D134#1#chr03 3140 3210\n", + "\tIn path\n", + "\t 73309415 73309485\n", + "{'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}\n", + "7046702 D134#1#chr03 3210 3211\n", + "\tIn path\n", + "\t 73309486 73309487\n", + "{'Q.START': 3140, 'Q.END': 3210, 'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}\n", + "7046703 D134#1#chr03 3211 3229\n", + "\tIn path\n", + "\t 73309488 73309506\n", + "{'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}\n", + "7046704 D134#1#chr03 3229 3230\n", + "\tIn path\n", + "\t 73309507 73309508\n", + "{'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}\n", + "7046706 D134#1#chr03 3230 3276\n", + "\tIn path\n", + "\t 73309509 73309555\n", + "{'Q.START': 3229, 'Q.END': 3230, 
'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}\n", + "7046707 D134#1#chr03 3276 3277\n", + "\tNot in path\n", + "7046709 D134#1#chr03 3277 3315\n", + "\tIn path\n", + "\t 73309558 73309596\n", + "{'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}\n", + "7046710 D134#1#chr03 3315 3316\n", + "\tNot in path\n", + "7046712 D134#1#chr03 3316 3322\n", + "\tIn path\n", + "\t 73309599 73309605\n", + "{'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}\n", + "7046713 D134#1#chr03 3322 3323\n", + "\tNot in path\n", + "7046715 D134#1#chr03 3323 3348\n", + "\tIn path\n", + "\t 73309608 73309633\n", + "{'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}\n", + "7046718 D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}\n", + "7046717 D134#1#chr03 3351 3352\n", + "\tIn path\n", + "\t 73309636 73309637\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046718 D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", + "7046717 D134#1#chr03 3351 3352\n", + "\tIn path\n", + "\t 73309636 73309637\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046718 D134#1#chr03 3352 3353\n", + "\tIn path\n", + "\t 73309634 73309635\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", + "7046720 D134#1#chr03 3353 3354\n", + "\tIn path\n", + "\t 73309638 73309639\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", + "7046722 D134#1#chr03 3354 3356\n", + "\tIn path\n", + "\t 73309640 73309642\n", + "{'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}\n", + "7046724 
D134#1#chr03 3356 3357\n", + "\tNot in path\n", + "7046725 D134#1#chr03 3357 3489\n", + "\tIn path\n", + "\t 73309645 73309777\n", + "{'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}\n", + "7046727 D134#1#chr03 3489 3490\n", + "\tNot in path\n", + "7046728 D134#1#chr03 3490 3642\n", + "\tIn path\n", + "\t 73309780 73309932\n", + "{'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}\n", + "7046729 D134#1#chr03 3642 3644\n", + "\tNot in path\n", + "7046730 D134#1#chr03 3644 3685\n", + "\tIn path\n", + "\t 73309933 73309974\n", + "{'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}\n", + "7046731 D134#1#chr03 3685 3687\n", + "\tNot in path\n", + "7046733 D134#1#chr03 3687 3693\n", + "\tIn path\n", + "\t 73309977 73309983\n", + "{'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}\n", + "7046735 D134#1#chr03 3693 3694\n", + "\tNot in path\n", + "7046736 D134#1#chr03 3694 3708\n", + "\tIn path\n", + "\t 73309986 73310000\n", + "{'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046740 D134#1#chr03 3716 3720\n", + "\tIn path\n", + "\t 73310005 73310009\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 
73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046740 D134#1#chr03 3716 3720\n", + "\tIn path\n", + "\t 73310005 73310009\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "7046738 D134#1#chr03 3720 3721\n", + "\tIn path\n", + "\t 73310010 73310011\n", + "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", + "7046739 D134#1#chr03 3721 3722\n", + "\tIn path\n", + "\t 73310003 73310004\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", + "7046741 D134#1#chr03 3722 3735\n", + "\tIn path\n", + "\t 73310012 73310045\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", + "ALN_1\n", + "7046526 TO1000#1#chr03 0 77\n", + "\t 64684013 64684090\n", + "skipped\n", + "\n", + "7046528 TO1000#1#chr03 77 82\n", + "\t 64684091 64684096\n", + "{'Q.START': 0, 'Q.END': 77, 'T.START': 64684013, 'T.END': 64684090, 'CG': '77='}\n", + "7046530 TO1000#1#chr03 82 83\n", + "\t 64684097 64684098\n", + "{'Q.START': 77, 'Q.END': 82, 'T.START': 64684091, 'T.END': 64684096, 'CG': '5='}\n", + "7046531 TO1000#1#chr03 83 138\n", + "\t 64684099 64684154\n", + "{'Q.START': 82, 'Q.END': 83, 'T.START': 64684097, 'T.END': 64684098, 'CG': '1='}\n", + "7046532 TO1000#1#chr03 138 139\n", + "\t 64684155 64684156\n", + "{'Q.START': 83, 'Q.END': 138, 'T.START': 64684099, 'T.END': 64684154, 'CG': '55='}\n", + "7046533 TO1000#1#chr03 139 202\n", + "\t 64684157 64684220\n", + "{'Q.START': 138, 'Q.END': 139, 'T.START': 64684155, 'T.END': 64684156, 'CG': '1='}\n", + "7046534 TO1000#1#chr03 202 203\n", + "\t 64684221 64684222\n", + "{'Q.START': 139, 'Q.END': 202, 'T.START': 64684157, 'T.END': 64684220, 'CG': '63='}\n", + "7046536 TO1000#1#chr03 203 379\n", + "\t 64684223 64684399\n", + "{'Q.START': 202, 'Q.END': 203, 'T.START': 64684221, 'T.END': 64684222, 'CG': '1='}\n", + "7046537 TO1000#1#chr03 379 380\n", + "\t 64684400 64684401\n", + 
"{'Q.START': 203, 'Q.END': 379, 'T.START': 64684223, 'T.END': 64684399, 'CG': '176='}\n", + "7046539 TO1000#1#chr03 380 429\n", + "\t 64684402 64684451\n", + "{'Q.START': 379, 'Q.END': 380, 'T.START': 64684400, 'T.END': 64684401, 'CG': '1='}\n", + "7046541 TO1000#1#chr03 429 430\n", + "\t 64684452 64684453\n", + "{'Q.START': 380, 'Q.END': 429, 'T.START': 64684402, 'T.END': 64684451, 'CG': '49='}\n", + "7046542 TO1000#1#chr03 430 457\n", + "\t 64684454 64684481\n", + "{'Q.START': 429, 'Q.END': 430, 'T.START': 64684452, 'T.END': 64684453, 'CG': '1='}\n", + "7046544 TO1000#1#chr03 457 492\n", + "\t 64684482 64684517\n", + "{'Q.START': 430, 'Q.END': 457, 'T.START': 64684454, 'T.END': 64684481, 'CG': '27='}\n", + "7046546 TO1000#1#chr03 492 494\n", + "\t 64684518 64684520\n", + "{'Q.START': 457, 'Q.END': 492, 'T.START': 64684482, 'T.END': 64684517, 'CG': '35='}\n", + "7046547 TO1000#1#chr03 494 497\n", + "\t 64684521 64684524\n", + "{'Q.START': 492, 'Q.END': 494, 'T.START': 64684518, 'T.END': 64684520, 'CG': '2='}\n", + "7046549 TO1000#1#chr03 497 507\n", + "\t 64684525 64684535\n", + "{'Q.START': 494, 'Q.END': 497, 'T.START': 64684521, 'T.END': 64684524, 'CG': '3='}\n", + "7046551 TO1000#1#chr03 507 508\n", + "\t 64684536 64684537\n", + "{'Q.START': 497, 'Q.END': 507, 'T.START': 64684525, 'T.END': 64684535, 'CG': '10='}\n", + "7046552 TO1000#1#chr03 508 564\n", + "\t 64684538 64684594\n", + "{'Q.START': 507, 'Q.END': 508, 'T.START': 64684536, 'T.END': 64684537, 'CG': '1='}\n", + "7046554 TO1000#1#chr03 564 566\n", + "\t 64684595 64684597\n", + "{'Q.START': 508, 'Q.END': 564, 'T.START': 64684538, 'T.END': 64684594, 'CG': '56='}\n", + "7046556 TO1000#1#chr03 568 569\n", + "\t 64684598 64684599\n", + "{'Q.START': 564, 'Q.END': 566, 'T.START': 64684595, 'T.END': 64684597, 'CG': '2='}\n", + "7046556 TO1000#1#chr03 568 569\n", + "\t 64684598 64684599\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", + "7046556 TO1000#1#chr03 568 
569\n", + "\t 64684598 64684599\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", + "7046557 TO1000#1#chr03 569 824\n", + "\t 64684600 64684855\n", + "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", + "7046558 TO1000#1#chr03 824 826\n", + "\t 64684856 64684858\n", + "{'Q.START': 569, 'Q.END': 824, 'T.START': 64684600, 'T.END': 64684855, 'CG': '255='}\n", + "7046559 TO1000#1#chr03 826 858\n", + "\t 64684859 64684891\n", + "{'Q.START': 824, 'Q.END': 826, 'T.START': 64684856, 'T.END': 64684858, 'CG': '2='}\n", + "7046560 TO1000#1#chr03 858 859\n", + "\t 64684892 64684893\n", + "{'Q.START': 826, 'Q.END': 858, 'T.START': 64684859, 'T.END': 64684891, 'CG': '32='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 858, 'Q.END': 859, 'T.START': 64684892, 'T.END': 64684893, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': 
'1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046561 TO1000#1#chr03 868 869\n", + "\t 64684894 64684895\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046562 TO1000#1#chr03 869 913\n", + "\t 64684896 64684940\n", + "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", + "7046564 TO1000#1#chr03 913 919\n", + "\t 64684941 64684947\n", + "{'Q.START': 869, 'Q.END': 913, 'T.START': 64684896, 'T.END': 64684940, 'CG': '44='}\n", + "7046565 TO1000#1#chr03 919 978\n", + "\t 64684948 64685007\n", + "{'Q.START': 913, 'Q.END': 919, 'T.START': 64684941, 'T.END': 64684947, 'CG': '6='}\n", + "7046567 TO1000#1#chr03 978 979\n", + "\t 64685008 64685009\n", + "{'Q.START': 919, 'Q.END': 978, 'T.START': 64684948, 'T.END': 64685007, 'CG': '59='}\n", + "7046568 TO1000#1#chr03 979 1038\n", + "\t 64685010 64685069\n", + "{'Q.START': 978, 'Q.END': 979, 'T.START': 64685008, 'T.END': 64685009, 'CG': '1='}\n", + "7046570 TO1000#1#chr03 1038 1045\n", + "\t 64685070 64685077\n", + "{'Q.START': 979, 'Q.END': 1038, 'T.START': 64685010, 'T.END': 64685069, 'CG': '59='}\n", + "7046571 TO1000#1#chr03 1045 1046\n", + "\t 64685078 64685079\n", + "{'Q.START': 1038, 'Q.END': 1045, 'T.START': 64685070, 'T.END': 64685077, 'CG': '7='}\n", + "7046573 TO1000#1#chr03 1046 1080\n", + "\t 64685080 64685114\n", + "{'Q.START': 1045, 'Q.END': 1046, 'T.START': 64685078, 'T.END': 64685079, 'CG': '1='}\n", + "7046574 TO1000#1#chr03 1080 1081\n", + "\t 64685115 64685116\n", + "{'Q.START': 1046, 'Q.END': 1080, 'T.START': 64685080, 'T.END': 64685114, 'CG': '34='}\n", + "7046576 TO1000#1#chr03 1081 1107\n", + "\t 64685117 64685143\n", + "{'Q.START': 1080, 'Q.END': 1081, 'T.START': 64685115, 'T.END': 64685116, 'CG': '1='}\n", + "7046577 TO1000#1#chr03 1107 1108\n", + "\t 64685144 64685145\n", + "{'Q.START': 1081, 
'Q.END': 1107, 'T.START': 64685117, 'T.END': 64685143, 'CG': '26='}\n", + "7046579 TO1000#1#chr03 1108 1183\n", + "\t 64685146 64685221\n", + "{'Q.START': 1107, 'Q.END': 1108, 'T.START': 64685144, 'T.END': 64685145, 'CG': '1='}\n", + "7046581 TO1000#1#chr03 1183 1186\n", + "\t 64685222 64685225\n", + "{'Q.START': 1108, 'Q.END': 1183, 'T.START': 64685146, 'T.END': 64685221, 'CG': '75='}\n", + "7046583 TO1000#1#chr03 1186 1224\n", + "\t 64685226 64685264\n", + "{'Q.START': 1183, 'Q.END': 1186, 'T.START': 64685222, 'T.END': 64685225, 'CG': '3='}\n", + "7046584 TO1000#1#chr03 1224 1257\n", + "\t 64685265 64685298\n", + "{'Q.START': 1186, 'Q.END': 1224, 'T.START': 64685226, 'T.END': 64685264, 'CG': '38='}\n", + "7046586 TO1000#1#chr03 1257 1289\n", + "\t 64685299 64685331\n", + "{'Q.START': 1224, 'Q.END': 1257, 'T.START': 64685265, 'T.END': 64685298, 'CG': '33='}\n", + "7046587 TO1000#1#chr03 1289 1311\n", + "\t 64685332 64685354\n", + "{'Q.START': 1257, 'Q.END': 1289, 'T.START': 64685299, 'T.END': 64685331, 'CG': '32='}\n", + "7046589 TO1000#1#chr03 1311 1359\n", + "\t 64685355 64685403\n", + "{'Q.START': 1289, 'Q.END': 1311, 'T.START': 64685332, 'T.END': 64685354, 'CG': '22='}\n", + "7046590 TO1000#1#chr03 1359 1382\n", + "\t 64685404 64685427\n", + "{'Q.START': 1311, 'Q.END': 1359, 'T.START': 64685355, 'T.END': 64685403, 'CG': '48='}\n", + "7046592 TO1000#1#chr03 1382 1434\n", + "\t 64685428 64685480\n", + "{'Q.START': 1359, 'Q.END': 1382, 'T.START': 64685404, 'T.END': 64685427, 'CG': '23='}\n", + "7046593 TO1000#1#chr03 1434 1451\n", + "\t 64685481 64685498\n", + "{'Q.START': 1382, 'Q.END': 1434, 'T.START': 64685428, 'T.END': 64685480, 'CG': '52='}\n", + "7046594 TO1000#1#chr03 1451 1531\n", + "\t 64685499 64685579\n", + "{'Q.START': 1434, 'Q.END': 1451, 'T.START': 64685481, 'T.END': 64685498, 'CG': '17='}\n", + "7046596 TO1000#1#chr03 1531 1532\n", + "\t 64685580 64685581\n", + "{'Q.START': 1451, 'Q.END': 1531, 'T.START': 64685499, 'T.END': 64685579, 'CG': 
'80='}\n", + "7046597 TO1000#1#chr03 1532 1543\n", + "\t 64685582 64685593\n", + "{'Q.START': 1531, 'Q.END': 1532, 'T.START': 64685580, 'T.END': 64685581, 'CG': '1='}\n", + "7046599 TO1000#1#chr03 1543 1544\n", + "\t 64685594 64685595\n", + "{'Q.START': 1532, 'Q.END': 1543, 'T.START': 64685582, 'T.END': 64685593, 'CG': '11='}\n", + "7046600 TO1000#1#chr03 1544 1572\n", + "\t 64685596 64685624\n", + "{'Q.START': 1543, 'Q.END': 1544, 'T.START': 64685594, 'T.END': 64685595, 'CG': '1='}\n", + "7046601 TO1000#1#chr03 1572 1573\n", + "\t 64685625 64685626\n", + "{'Q.START': 1544, 'Q.END': 1572, 'T.START': 64685596, 'T.END': 64685624, 'CG': '28='}\n", + "7046603 TO1000#1#chr03 1573 1587\n", + "\t 64685627 64685641\n", + "{'Q.START': 1572, 'Q.END': 1573, 'T.START': 64685625, 'T.END': 64685626, 'CG': '1='}\n", + "7046604 TO1000#1#chr03 1587 1588\n", + "\t 64685642 64685643\n", + "{'Q.START': 1573, 'Q.END': 1587, 'T.START': 64685627, 'T.END': 64685641, 'CG': '14='}\n", + "7046606 TO1000#1#chr03 1588 1616\n", + "\t 64685644 64685672\n", + "{'Q.START': 1587, 'Q.END': 1588, 'T.START': 64685642, 'T.END': 64685643, 'CG': '1='}\n", + "7046608 TO1000#1#chr03 1616 1617\n", + "\t 64685673 64685674\n", + "{'Q.START': 1588, 'Q.END': 1616, 'T.START': 64685644, 'T.END': 64685672, 'CG': '28='}\n", + "7046609 TO1000#1#chr03 1617 1646\n", + "\t 64685675 64685704\n", + "{'Q.START': 1616, 'Q.END': 1617, 'T.START': 64685673, 'T.END': 64685674, 'CG': '1='}\n", + "7046621 TO1000#1#chr03 1646 1661\n", + "\t 64685705 64685720\n", + "{'Q.START': 1617, 'Q.END': 1646, 'T.START': 64685675, 'T.END': 64685704, 'CG': '29='}\n", + "7046622 TO1000#1#chr03 1661 1673\n", + "\t 64685721 64685733\n", + "{'Q.START': 1646, 'Q.END': 1661, 'T.START': 64685705, 'T.END': 64685720, 'CG': '15='}\n", + "7046624 TO1000#1#chr03 1673 1674\n", + "\t 64685734 64685735\n", + "{'Q.START': 1661, 'Q.END': 1673, 'T.START': 64685721, 'T.END': 64685733, 'CG': '12='}\n", + "7046625 TO1000#1#chr03 1674 1726\n", + "\t 64685736 
64685788\n", + "{'Q.START': 1673, 'Q.END': 1674, 'T.START': 64685734, 'T.END': 64685735, 'CG': '1='}\n", + "7046626 TO1000#1#chr03 1726 1727\n", + "\t 64685789 64685790\n", + "{'Q.START': 1674, 'Q.END': 1726, 'T.START': 64685736, 'T.END': 64685788, 'CG': '52='}\n", + "7046628 TO1000#1#chr03 1727 1762\n", + "\t 64685791 64685826\n", + "{'Q.START': 1726, 'Q.END': 1727, 'T.START': 64685789, 'T.END': 64685790, 'CG': '1='}\n", + "7046631 TO1000#1#chr03 1766 1767\n", + "\t 64685827 64685828\n", + "{'Q.START': 1727, 'Q.END': 1762, 'T.START': 64685791, 'T.END': 64685826, 'CG': '35='}\n", + "7046673 TO1000#1#chr03 1765 1766\n", + "\t 64685829 64685830\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", + "7046631 TO1000#1#chr03 1766 1767\n", + "\t 64685827 64685828\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 64685829, 'T.END': 64685830, 'CG': '1='}\n", + "7046673 TO1000#1#chr03 1765 1766\n", + "\t 64685829 64685830\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", + "7046631 TO1000#1#chr03 1766 1767\n", + "\t 64685827 64685828\n", + "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 64685829, 'T.END': 64685830, 'CG': '1='}\n", + "7046632 TO1000#1#chr03 1767 1824\n", + "\t 64685831 64685888\n", + "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", + "7046634 TO1000#1#chr03 1824 1825\n", + "\t 64685889 64685890\n", + "{'Q.START': 1767, 'Q.END': 1824, 'T.START': 64685831, 'T.END': 64685888, 'CG': '57='}\n", + "7046635 TO1000#1#chr03 1825 1975\n", + "\t 64685891 64686041\n", + "{'Q.START': 1824, 'Q.END': 1825, 'T.START': 64685889, 'T.END': 64685890, 'CG': '1='}\n", + "7046637 TO1000#1#chr03 1975 1976\n", + "\t 64686042 64686043\n", + "{'Q.START': 1825, 'Q.END': 1975, 'T.START': 64685891, 'T.END': 64686041, 'CG': '150='}\n", + "7046638 TO1000#1#chr03 1976 2015\n", + "\t 64686044 64686083\n", + "{'Q.START': 1975, 'Q.END': 1976, 'T.START': 64686042, 
'T.END': 64686043, 'CG': '1='}\n", + "7046639 TO1000#1#chr03 2015 2016\n", + "\t 64686084 64686085\n", + "{'Q.START': 1976, 'Q.END': 2015, 'T.START': 64686044, 'T.END': 64686083, 'CG': '39='}\n", + "7046641 TO1000#1#chr03 2016 2047\n", + "\t 64686086 64686117\n", + "{'Q.START': 2015, 'Q.END': 2016, 'T.START': 64686084, 'T.END': 64686085, 'CG': '1='}\n", + "7046644 TO1000#1#chr03 2047 2055\n", + "\t 64686118 64686126\n", + "{'Q.START': 2016, 'Q.END': 2047, 'T.START': 64686086, 'T.END': 64686117, 'CG': '31='}\n", + "7046646 TO1000#1#chr03 2055 2056\n", + "\t 64686127 64686128\n", + "{'Q.START': 2047, 'Q.END': 2055, 'T.START': 64686118, 'T.END': 64686126, 'CG': '8='}\n", + "7046647 TO1000#1#chr03 2056 2120\n", + "\t 64686129 64686193\n", + "{'Q.START': 2055, 'Q.END': 2056, 'T.START': 64686127, 'T.END': 64686128, 'CG': '1='}\n", + "7046649 TO1000#1#chr03 2120 2121\n", + "\t 64686194 64686195\n", + "{'Q.START': 2056, 'Q.END': 2120, 'T.START': 64686129, 'T.END': 64686193, 'CG': '64='}\n", + "7046650 TO1000#1#chr03 2121 2157\n", + "\t 64686196 64686232\n", + "{'Q.START': 2120, 'Q.END': 2121, 'T.START': 64686194, 'T.END': 64686195, 'CG': '1='}\n", + "7046652 TO1000#1#chr03 2157 2158\n", + "\t 64686233 64686234\n", + "{'Q.START': 2121, 'Q.END': 2157, 'T.START': 64686196, 'T.END': 64686232, 'CG': '36='}\n", + "7046653 TO1000#1#chr03 2158 2170\n", + "\t 64686235 64686247\n", + "{'Q.START': 2157, 'Q.END': 2158, 'T.START': 64686233, 'T.END': 64686234, 'CG': '1='}\n", + "7046654 TO1000#1#chr03 2170 2171\n", + "\t 64686248 64686249\n", + "{'Q.START': 2158, 'Q.END': 2170, 'T.START': 64686235, 'T.END': 64686247, 'CG': '12='}\n", + "7046656 TO1000#1#chr03 2171 2205\n", + "\t 64686250 64686284\n", + "{'Q.START': 2170, 'Q.END': 2171, 'T.START': 64686248, 'T.END': 64686249, 'CG': '1='}\n", + "7046657 TO1000#1#chr03 2205 2206\n", + "\t 64686285 64686286\n", + "{'Q.START': 2171, 'Q.END': 2205, 'T.START': 64686250, 'T.END': 64686284, 'CG': '34='}\n", + "7046659 TO1000#1#chr03 2206 
2344\n", + "\t 64686287 64686425\n", + "{'Q.START': 2205, 'Q.END': 2206, 'T.START': 64686285, 'T.END': 64686286, 'CG': '1='}\n", + "7046660 TO1000#1#chr03 2344 2345\n", + "\t 64686426 64686427\n", + "{'Q.START': 2206, 'Q.END': 2344, 'T.START': 64686287, 'T.END': 64686425, 'CG': '138='}\n", + "7046662 TO1000#1#chr03 2345 2364\n", + "\t 64686428 64686447\n", + "{'Q.START': 2344, 'Q.END': 2345, 'T.START': 64686426, 'T.END': 64686427, 'CG': '1='}\n", + "7046663 TO1000#1#chr03 2364 2383\n", + "\t 64686448 64686467\n", + "{'Q.START': 2345, 'Q.END': 2364, 'T.START': 64686428, 'T.END': 64686447, 'CG': '19='}\n", + "7046665 TO1000#1#chr03 2383 2408\n", + "\t 64686468 64686493\n", + "{'Q.START': 2364, 'Q.END': 2383, 'T.START': 64686448, 'T.END': 64686467, 'CG': '19='}\n", + "7046667 TO1000#1#chr03 2408 2409\n", + "\t 64686494 64686495\n", + "{'Q.START': 2383, 'Q.END': 2408, 'T.START': 64686468, 'T.END': 64686493, 'CG': '25='}\n", + "7046668 TO1000#1#chr03 2409 2441\n", + "\t 64686496 64686528\n", + "{'Q.START': 2408, 'Q.END': 2409, 'T.START': 64686494, 'T.END': 64686495, 'CG': '1='}\n", + "7046670 TO1000#1#chr03 2441 2442\n", + "\t 64686529 64686530\n", + "{'Q.START': 2409, 'Q.END': 2441, 'T.START': 64686496, 'T.END': 64686528, 'CG': '32='}\n", + "7046671 TO1000#1#chr03 2442 2580\n", + "\t 64686531 64686669\n", + "{'Q.START': 2441, 'Q.END': 2442, 'T.START': 64686529, 'T.END': 64686530, 'CG': '1='}\n", + "7046674 TO1000#1#chr03 2582 2583\n", + "\t 64686670 64686671\n", + "{'Q.START': 2442, 'Q.END': 2580, 'T.START': 64686531, 'T.END': 64686669, 'CG': '138='}\n", + "7046675 TO1000#1#chr03 2583 2584\n", + "\t 64686672 64686673\n", + "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 64686670, 'T.END': 64686671, 'CG': '1='}\n", + "7046674 TO1000#1#chr03 2582 2583\n", + "\t 64686670 64686671\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 64686672, 'T.END': 64686673, 'CG': '1='}\n", + "7046675 TO1000#1#chr03 2583 2584\n", + "\t 64686672 64686673\n", + "{'Q.START': 2582, 'Q.END': 
2583, 'T.START': 64686670, 'T.END': 64686671, 'CG': '1='}\n", + "7046676 TO1000#1#chr03 2584 2764\n", + "\t 64686674 64686854\n", + "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 64686672, 'T.END': 64686673, 'CG': '1='}\n", + "7046678 TO1000#1#chr03 2764 2765\n", + "\t 64686855 64686856\n", + "{'Q.START': 2584, 'Q.END': 2764, 'T.START': 64686674, 'T.END': 64686854, 'CG': '180='}\n", + "7046679 TO1000#1#chr03 2765 2797\n", + "\t 64686857 64686889\n", + "{'Q.START': 2764, 'Q.END': 2765, 'T.START': 64686855, 'T.END': 64686856, 'CG': '1='}\n", + "7046680 TO1000#1#chr03 2797 2798\n", + "\t 64686890 64686891\n", + "{'Q.START': 2765, 'Q.END': 2797, 'T.START': 64686857, 'T.END': 64686889, 'CG': '32='}\n", + "7046682 TO1000#1#chr03 2798 2878\n", + "\t 64686892 64686972\n", + "{'Q.START': 2797, 'Q.END': 2798, 'T.START': 64686890, 'T.END': 64686891, 'CG': '1='}\n", + "7046684 TO1000#1#chr03 2878 2879\n", + "\t 64686973 64686974\n", + "{'Q.START': 2798, 'Q.END': 2878, 'T.START': 64686892, 'T.END': 64686972, 'CG': '80='}\n", + "7046685 TO1000#1#chr03 2879 2951\n", + "\t 64686975 64687047\n", + "{'Q.START': 2878, 'Q.END': 2879, 'T.START': 64686973, 'T.END': 64686974, 'CG': '1='}\n", + "7046686 TO1000#1#chr03 2951 2952\n", + "\t 64687048 64687049\n", + "{'Q.START': 2879, 'Q.END': 2951, 'T.START': 64686975, 'T.END': 64687047, 'CG': '72='}\n", + "7046688 TO1000#1#chr03 2952 3002\n", + "\t 64687050 64687100\n", + "{'Q.START': 2951, 'Q.END': 2952, 'T.START': 64687048, 'T.END': 64687049, 'CG': '1='}\n", + "7046690 TO1000#1#chr03 3002 3077\n", + "\t 64687101 64687176\n", + "{'Q.START': 2952, 'Q.END': 3002, 'T.START': 64687050, 'T.END': 64687100, 'CG': '50='}\n", + "7046692 TO1000#1#chr03 3077 3078\n", + "\t 64687177 64687178\n", + "{'Q.START': 3002, 'Q.END': 3077, 'T.START': 64687101, 'T.END': 64687176, 'CG': '75='}\n", + "7046693 TO1000#1#chr03 3078 3093\n", + "\t 64687179 64687194\n", + "{'Q.START': 3077, 'Q.END': 3078, 'T.START': 64687177, 'T.END': 64687178, 'CG': '1='}\n", + "7046695 
TO1000#1#chr03 3093 3094\n", + "\t 64687195 64687196\n", + "{'Q.START': 3078, 'Q.END': 3093, 'T.START': 64687179, 'T.END': 64687194, 'CG': '15='}\n", + "7046696 TO1000#1#chr03 3094 3097\n", + "\t 64687197 64687200\n", + "{'Q.START': 3093, 'Q.END': 3094, 'T.START': 64687195, 'T.END': 64687196, 'CG': '1='}\n", + "7046698 TO1000#1#chr03 3097 3140\n", + "\t 64687201 64687244\n", + "{'Q.START': 3094, 'Q.END': 3097, 'T.START': 64687197, 'T.END': 64687200, 'CG': '3='}\n", + "7046700 TO1000#1#chr03 3140 3210\n", + "\t 64687245 64687315\n", + "{'Q.START': 3097, 'Q.END': 3140, 'T.START': 64687201, 'T.END': 64687244, 'CG': '43='}\n", + "7046702 TO1000#1#chr03 3210 3211\n", + "\t 64687316 64687317\n", + "{'Q.START': 3140, 'Q.END': 3210, 'T.START': 64687245, 'T.END': 64687315, 'CG': '70='}\n", + "7046703 TO1000#1#chr03 3211 3229\n", + "\t 64687318 64687336\n", + "{'Q.START': 3210, 'Q.END': 3211, 'T.START': 64687316, 'T.END': 64687317, 'CG': '1='}\n", + "7046704 TO1000#1#chr03 3229 3230\n", + "\t 64687337 64687338\n", + "{'Q.START': 3211, 'Q.END': 3229, 'T.START': 64687318, 'T.END': 64687336, 'CG': '18='}\n", + "7046706 TO1000#1#chr03 3230 3276\n", + "\t 64687339 64687385\n", + "{'Q.START': 3229, 'Q.END': 3230, 'T.START': 64687337, 'T.END': 64687338, 'CG': '1='}\n", + "7046707 TO1000#1#chr03 3276 3277\n", + "\t 64687386 64687387\n", + "{'Q.START': 3230, 'Q.END': 3276, 'T.START': 64687339, 'T.END': 64687385, 'CG': '46='}\n", + "7046709 TO1000#1#chr03 3277 3315\n", + "\t 64687388 64687426\n", + "{'Q.START': 3276, 'Q.END': 3277, 'T.START': 64687386, 'T.END': 64687387, 'CG': '1='}\n", + "7046710 TO1000#1#chr03 3315 3316\n", + "\t 64687427 64687428\n", + "{'Q.START': 3277, 'Q.END': 3315, 'T.START': 64687388, 'T.END': 64687426, 'CG': '38='}\n", + "7046712 TO1000#1#chr03 3316 3322\n", + "\t 64687429 64687435\n", + "{'Q.START': 3315, 'Q.END': 3316, 'T.START': 64687427, 'T.END': 64687428, 'CG': '1='}\n", + "7046713 TO1000#1#chr03 3322 3323\n", + "\t 64687436 64687437\n", + "{'Q.START': 
3316, 'Q.END': 3322, 'T.START': 64687429, 'T.END': 64687435, 'CG': '6='}\n", + "7046715 TO1000#1#chr03 3323 3348\n", + "\t 64687438 64687463\n", + "{'Q.START': 3322, 'Q.END': 3323, 'T.START': 64687436, 'T.END': 64687437, 'CG': '1='}\n", + "7046718 TO1000#1#chr03 3352 3353\n", + "\t 64687464 64687465\n", + "{'Q.START': 3323, 'Q.END': 3348, 'T.START': 64687438, 'T.END': 64687463, 'CG': '25='}\n", + "7046717 TO1000#1#chr03 3351 3352\n", + "\t 64687466 64687467\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", + "7046718 TO1000#1#chr03 3352 3353\n", + "\t 64687464 64687465\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 64687466, 'T.END': 64687467, 'CG': '1='}\n", + "7046717 TO1000#1#chr03 3351 3352\n", + "\t 64687466 64687467\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", + "7046718 TO1000#1#chr03 3352 3353\n", + "\t 64687464 64687465\n", + "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 64687466, 'T.END': 64687467, 'CG': '1='}\n", + "7046720 TO1000#1#chr03 3353 3354\n", + "\t 64687468 64687469\n", + "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", + "7046722 TO1000#1#chr03 3354 3356\n", + "\t 64687470 64687472\n", + "{'Q.START': 3353, 'Q.END': 3354, 'T.START': 64687468, 'T.END': 64687469, 'CG': '1='}\n", + "7046724 TO1000#1#chr03 3356 3357\n", + "\t 64687473 64687474\n", + "{'Q.START': 3354, 'Q.END': 3356, 'T.START': 64687470, 'T.END': 64687472, 'CG': '2='}\n", + "7046725 TO1000#1#chr03 3357 3489\n", + "\t 64687475 64687607\n", + "{'Q.START': 3356, 'Q.END': 3357, 'T.START': 64687473, 'T.END': 64687474, 'CG': '1='}\n", + "7046727 TO1000#1#chr03 3489 3490\n", + "\t 64687608 64687609\n", + "{'Q.START': 3357, 'Q.END': 3489, 'T.START': 64687475, 'T.END': 64687607, 'CG': '132='}\n", + "7046728 TO1000#1#chr03 3490 3642\n", + "\t 64687610 64687762\n", + "{'Q.START': 3489, 'Q.END': 3490, 'T.START': 64687608, 'T.END': 64687609, 'CG': '1='}\n", 
+ "7046729 TO1000#1#chr03 3642 3644\n", + "\t 64687763 64687765\n", + "{'Q.START': 3490, 'Q.END': 3642, 'T.START': 64687610, 'T.END': 64687762, 'CG': '152='}\n", + "7046730 TO1000#1#chr03 3644 3685\n", + "\t 64687766 64687807\n", + "{'Q.START': 3642, 'Q.END': 3644, 'T.START': 64687763, 'T.END': 64687765, 'CG': '2='}\n", + "7046731 TO1000#1#chr03 3685 3687\n", + "\t 64687808 64687810\n", + "{'Q.START': 3644, 'Q.END': 3685, 'T.START': 64687766, 'T.END': 64687807, 'CG': '41='}\n", + "7046733 TO1000#1#chr03 3687 3693\n", + "\t 64687811 64687817\n", + "{'Q.START': 3685, 'Q.END': 3687, 'T.START': 64687808, 'T.END': 64687810, 'CG': '2='}\n", + "7046735 TO1000#1#chr03 3693 3694\n", + "\t 64687818 64687819\n", + "{'Q.START': 3687, 'Q.END': 3693, 'T.START': 64687811, 'T.END': 64687817, 'CG': '6='}\n", + "7046736 TO1000#1#chr03 3694 3708\n", + "\t 64687820 64687834\n", + "{'Q.START': 3693, 'Q.END': 3694, 'T.START': 64687818, 'T.END': 64687819, 'CG': '1='}\n", + "7046738 TO1000#1#chr03 3720 3721\n", + "\t 64687835 64687836\n", + "{'Q.START': 3694, 'Q.END': 3708, 'T.START': 64687820, 'T.END': 64687834, 'CG': '14='}\n", + "7046739 TO1000#1#chr03 3721 3722\n", + "\t 64687837 64687838\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", + "7046740 TO1000#1#chr03 3716 3720\n", + "\t 64687839 64687843\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", + "7046738 TO1000#1#chr03 3720 3721\n", + "\t 64687835 64687836\n", + "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 64687839, 'T.END': 64687843, 'CG': '4='}\n", + "7046739 TO1000#1#chr03 3721 3722\n", + "\t 64687837 64687838\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", + "7046740 TO1000#1#chr03 3716 3720\n", + "\t 64687839 64687843\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", + "7046738 TO1000#1#chr03 3720 3721\n", + "\t 64687835 64687836\n", + 
"{'Q.START': 3716, 'Q.END': 3720, 'T.START': 64687839, 'T.END': 64687843, 'CG': '4='}\n", + "7046739 TO1000#1#chr03 3721 3722\n", + "\t 64687837 64687838\n", + "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", + "7046741 TO1000#1#chr03 3722 3735\n", + "\t 64687844 64687877\n", + "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", + "ALN_2\n", + "7594382 D134#1#chr03 0 1\n", + "\tIn path\n", + "\t 70220037 70220038\n", + "skipped\n", + "\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 0, 'Q.END': 1, 'T.START': 70220037, 'T.END': 70220038, 'CG': '1='}\n", + "7594371 D134#1#chr03 15 16\n", + "\tIn path\n", + "\t 70221163 70221164\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 15, 'Q.END': 16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 
70219093, 'CG': '1='}\n", + "7594626 D134#1#chr03 10 11\n", + "\tIn path\n", + "\t 70219214 70219215\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594011 D134#1#chr03 11 12\n", + "\tIn path\n", + "\t 70219995 70219996\n", + "{'Q.START': 10, 'Q.END': 11, 'T.START': 70219214, 'T.END': 70219215, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 11, 'Q.END': 12, 'T.START': 70219995, 'T.END': 70219996, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594371 D134#1#chr03 15 16\n", + "\tIn path\n", + "\t 70221163 70221164\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 15, 'Q.END': 16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594241 D134#1#chr03 20 21\n", + "\tIn path\n", + "\t 70219220 70219221\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594248 D134#1#chr03 21 22\n", + "\tNot in path\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + 
"{'Q.START': 20, 'Q.END': 21, 'T.START': 70219220, 'T.END': 70219221, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", + "7594330 D134#1#chr03 26 27\n", + "\tNot in path\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594369 D134#1#chr03 32 33\n", + "\tIn path\n", + "\t 70219216 70219217\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", + "7594026 D134#1#chr03 37 38\n", + "\tIn path\n", + "\t 70220249 70220250\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 
D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594026 D134#1#chr03 37 38\n", + "\tIn path\n", + "\t 70220249 70220250\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + 
"7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594315 D134#1#chr03 53 54\n", + "\tIn path\n", + "\t 70219857 70219858\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", + "7594311 D134#1#chr03 55 56\n", + "\tIn path\n", + "\t 70219351 70219352\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", + "7594021 D134#1#chr03 57 58\n", + "\tIn path\n", + "\t 70219218 70219219\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': 
'1X'}\n", + "7594286 D134#1#chr03 59 60\n", + "\tIn path\n", + "\t 70219349 70219350\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594356 D134#1#chr03 66 67\n", + "\tIn path\n", + "\t 70219570 70219571\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", + "7594375 D134#1#chr03 68 69\n", + "\tIn path\n", + "\t 70221598 70221599\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", + "7594374 D134#1#chr03 69 70\n", + "\tIn path\n", + "\t 70219092 70219093\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", + "7594350 D134#1#chr03 70 71\n", + "\tIn path\n", + "\t 70219226 70219227\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 
'CG': '1='}\n", + "7594264 D134#1#chr03 71 72\n", + "\tIn path\n", + "\t 70219228 70219229\n", + "{'Q.START': 70, 'Q.END': 71, 'T.START': 70219226, 'T.END': 70219227, 'CG': '1='}\n", + "7594207 D134#1#chr03 72 73\n", + "\tIn path\n", + "\t 70219230 70219231\n", + "{'Q.START': 71, 'Q.END': 72, 'T.START': 70219228, 'T.END': 70219229, 'CG': '1='}\n", + "7594225 D134#1#chr03 73 74\n", + "\tIn path\n", + "\t 70219232 70219233\n", + "{'Q.START': 72, 'Q.END': 73, 'T.START': 70219230, 'T.END': 70219231, 'CG': '1='}\n", + "7594227 D134#1#chr03 74 75\n", + "\tIn path\n", + "\t 70220150 70220151\n", + "{'Q.START': 73, 'Q.END': 74, 'T.START': 70219232, 'T.END': 70219233, 'CG': '1='}\n", + "7594120 D134#1#chr03 75 76\n", + "\tIn path\n", + "\t 70219236 70219237\n", + "{'Q.START': 74, 'Q.END': 75, 'T.START': 70220150, 'T.END': 70220151, 'CG': '1='}\n", + "7594132 D134#1#chr03 76 77\n", + "\tIn path\n", + "\t 70219777 70219778\n", + "{'Q.START': 75, 'Q.END': 76, 'T.START': 70219236, 'T.END': 70219237, 'CG': '1='}\n", + "7594165 D134#1#chr03 77 78\n", + "\tIn path\n", + "\t 70219240 70219241\n", + "{'Q.START': 76, 'Q.END': 77, 'T.START': 70219777, 'T.END': 70219778, 'CG': '1='}\n", + "7594172 D134#1#chr03 78 3735\n", + "\tNot in path\n", + "ALN_2\n", + "7594382 TO1000#1#chr03 0 1\n", + "\t 61731222 61731223\n", + "skipped\n", + "\n", + "7594369 TO1000#1#chr03 32 33\n", + "\t 61731060 61731061\n", + "{'Q.START': 0, 'Q.END': 1, 'T.START': 61731222, 'T.END': 61731223, 'CG': '1='}\n", + "7594371 TO1000#1#chr03 15 16\n", + "\tNot in path\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': 
'1='}\n", + "7594356 TO1000#1#chr03 66 67\n", + "\t 61731519 61731520\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594375 TO1000#1#chr03 68 69\n", + "\t 61733612 61733613\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594626 TO1000#1#chr03 10 11\n", + "\t 61731056 61731057\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", + "7594011 TO1000#1#chr03 11 12\n", + "\t 61733900 61733901\n", + "{'Q.START': 10, 'Q.END': 11, 'T.START': 61731056, 'T.END': 61731057, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 11, 'Q.END': 12, 'T.START': 61733900, 'T.END': 61733901, 'CG': '1='}\n", + "7594375 TO1000#1#chr03 68 69\n", + "\t 61733612 61733613\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594369 TO1000#1#chr03 32 33\n", + "\t 61731060 61731061\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", + "7594371 TO1000#1#chr03 15 16\n", + "\tNot in path\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 
61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594241 TO1000#1#chr03 20 21\n", + "\t 61731046 61731047\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594248 TO1000#1#chr03 21 22\n", + "\t 61734261 61734262\n", + "{'Q.START': 20, 'Q.END': 21, 'T.START': 61731046, 'T.END': 61731047, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 21, 'Q.END': 22, 'T.START': 61734261, 'T.END': 61734262, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594315 TO1000#1#chr03 53 54\n", + "\t 61733937 61733938\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", + "7594330 TO1000#1#chr03 26 27\n", + "\t 61731768 61731769\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 26, 'Q.END': 27, 'T.START': 61731768, 'T.END': 61731769, 'CG': '1='}\n", + "7594315 TO1000#1#chr03 53 54\n", + "\t 61733937 61733938\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594369 TO1000#1#chr03 32 33\n", + "\t 61731060 61731061\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 
61730921, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", + "7594026 TO1000#1#chr03 37 38\n", + "\t 61734267 61734268\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 61734267, 'T.END': 61734268, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594026 TO1000#1#chr03 37 38\n", + "\t 61734267 61734268\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 37, 'Q.END': 38, 'T.START': 61734267, 'T.END': 61734268, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': 
'1X'}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594315 TO1000#1#chr03 53 54\n", + "\t 61733937 61733938\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", + "7594311 TO1000#1#chr03 55 56\n", + "\t 61731052 61731053\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", + "7594021 TO1000#1#chr03 57 58\n", + "\t 61730922 61730923\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594286 TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", + "7594286 
TO1000#1#chr03 59 60\n", + "\t 61731054 61731055\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", + "7594356 TO1000#1#chr03 66 67\n", + "\t 61731519 61731520\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594375 TO1000#1#chr03 68 69\n", + "\t 61733612 61733613\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", + "7594356 TO1000#1#chr03 66 67\n", + "\t 61731519 61731520\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", + "7594375 TO1000#1#chr03 68 69\n", + "\t 61733612 61733613\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594374 TO1000#1#chr03 69 70\n", + "\t 61730920 61730921\n", + "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", + "7594350 TO1000#1#chr03 70 71\n", + "\t 61731066 61731067\n", + "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", + "7594264 TO1000#1#chr03 71 72\n", + "\t 61731068 61731069\n", + "{'Q.START': 70, 'Q.END': 71, 'T.START': 61731066, 'T.END': 61731067, 'CG': '1='}\n", + "7594207 TO1000#1#chr03 72 
73\n", + "\t 61731070 61731071\n", + "{'Q.START': 71, 'Q.END': 72, 'T.START': 61731068, 'T.END': 61731069, 'CG': '1='}\n", + "7594225 TO1000#1#chr03 73 74\n", + "\t 61731072 61731073\n", + "{'Q.START': 72, 'Q.END': 73, 'T.START': 61731070, 'T.END': 61731071, 'CG': '1='}\n", + "7594227 TO1000#1#chr03 74 75\n", + "\tNot in path\n", + "7594120 TO1000#1#chr03 75 76\n", + "\t 61731076 61731077\n", + "{'Q.START': 73, 'Q.END': 74, 'T.START': 61731072, 'T.END': 61731073, 'CG': '1='}\n", + "7594132 TO1000#1#chr03 76 77\n", + "\t 61733800 61733801\n", + "{'Q.START': 75, 'Q.END': 76, 'T.START': 61731076, 'T.END': 61731077, 'CG': '1='}\n", + "7594165 TO1000#1#chr03 77 78\n", + "\t 61731080 61731081\n", + "{'Q.START': 76, 'Q.END': 77, 'T.START': 61733800, 'T.END': 61733801, 'CG': '1='}\n", + "7594172 TO1000#1#chr03 78 3735\n", + "\tNot in path\n" + ] + } + ], + "source": [ + "ALNS = {}\n", + "## Iterating over alignments\n", + "for aln_name in aln_dict.keys():\n", + " \n", + " ## Iterating over paths of the gfa\n", + " for path_name in paths.keys():\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(aln_name)\n", + " _ = [] # Temporary list holding alignment blocks\n", + "\n", + " ## Iterating over alignment nodes of the current alignment\n", + " for node_id, orient in aln_dict[aln_name][\"PATH.MATCH\"]:\n", + "\n", + " # Getting node info\n", + " n_info = nodes[node_id]\n", + " q_start = n_info[aln_name][\"START\"] # Start position on the query\n", + " q_end = n_info[aln_name][\"END\"] # End position on the query\n", + " _CG = n_info[aln_name][\"CIGAR\"] # Cigar of the alignment on the current node\n", + "\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(node_id, path_name, q_start, q_end)\n", + "\n", + " ## Checking if path is traversing the current node\n", + " if path_name in list(n_info.keys()):\n", + " if path_name == \"D134#1#chr03\": print(\"\\tIn path\")\n", + "\n", + " ## Getting start and end position on the target given the 
orientation of the node in the alignment and the path\n", + " if n_info[aln_name][\"STRAND\"] == n_info[path_name][\"STRAND\"] :\n", + " t_start = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", + " t_end = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"] \n", + " else :\n", + " t_end = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", + " t_start = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"]\n", + "\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\t\", t_start, t_end)\n", + "\n", + " \"\"\"\n", + " If the latest block t.end and q.end matches with the current node t.start and q.start, \n", + " the node should be added to the block. Else, we terminate the block and add the node to a new block\n", + " \"\"\"\n", + " \n", + " # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : \n", + " if len(_) and _[-1][\"T.END\"] == t_start and _[-1][\"Q.END\"]+1 == q_start: \n", + " tmp_aln[\"Q.END\"] = q_end\n", + " tmp_aln[\"T.END\"] = t_end\n", + " tmp_aln[\"CG\"] += _CG\n", + "# elif len(_) and _[-1][\"T.END\"] == t_start: # Following on the target not on the query (i.e. Insertion)\n", + "# tmp_aln[\"T.END\"] = t_end\n", + "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}I\"\n", + "# elif len(_) and _[-1][\"Q.END\"]+1 == q_start: # Following on the query, not on the target (i.e. 
Deletion)\n", + "# tmp_aln[\"Q.END\"] = q_end\n", + "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}D\"\n", + " else : # Else, completely different\n", + " try : \n", + " _.append(tmp_aln)\n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(tmp_aln)\n", + " except : \n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"skipped\\n\")\n", + " tmp_aln = {\n", + " \"Q.START\": q_start,\n", + " \"Q.END\": q_end,\n", + " \"T.START\": t_start,\n", + " \"T.END\": t_end,\n", + " \"CG\": _CG,\n", + " }\n", + " \n", + " else : \n", + " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\tNot in path\")\n", + " # Node is not in the path\n", + "\n", + " del tmp_aln\n", + " \n", + " ALNS[(path_name, aln_name)] = _" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "547f03fa-cbd5-42f9-b668-1ca4404795ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'Q.START': 0, 'Q.END': 77, 'T.START': 73306158, 'T.END': 73306235, 'CG': '77='}, {'Q.START': 77, 'Q.END': 82, 'T.START': 73306238, 'T.END': 73306243, 'CG': '5='}, {'Q.START': 83, 'Q.END': 138, 'T.START': 73306246, 'T.END': 73306301, 'CG': '55='}, {'Q.START': 139, 'Q.END': 202, 'T.START': 73306302, 'T.END': 73306365, 'CG': '63='}, {'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}, {'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}, {'Q.START': 379, 'Q.END': 380, 'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}, {'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}, {'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}, {'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}, {'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}, {'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}, {'Q.START': 568, 
'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}, {'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}, {'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}, {'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}, {'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}, {'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}, {'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}, {'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}, {'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}, {'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 
'CG': '34='}, {'Q.START': 1080, 'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}, {'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}, {'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}, {'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}, {'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}, {'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}, {'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}, {'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}, {'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}, {'Q.START': 1532, 'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}, {'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}, {'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}, {'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}, {'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}, {'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 'T.END': 73307836, 'CG': '1='}, {'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}, {'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}, {'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}, {'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}, {'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}, {'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 
'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}, {'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}, {'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}, {'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}, {'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': '31='}, {'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}, {'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}, {'Q.START': 2120, 'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}, {'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}, {'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}, {'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}, {'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 73308452, 'CG': '34='}, {'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}, {'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}, {'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': '19='}, {'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}, {'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}, {'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': '32='}, {'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}, {'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': 
'138='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}, {'Q.START': 2765, 'Q.END': 2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}, {'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}, {'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}, {'Q.START': 2879, 'Q.END': 2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}, {'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}, {'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}, {'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}, {'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}, {'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}, {'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 73309370, 'CG': '3='}, {'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}, {'Q.START': 3140, 'Q.END': 3210, 'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}, {'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}, {'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}, {'Q.START': 3229, 'Q.END': 3230, 'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}, {'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}, {'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}, {'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}, 
{'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}, {'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}, {'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}, {'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}, {'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}, {'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}, {'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}]\n" + ] + } + ], + "source": [ + "print(ALNS[(\"D134#1#chr03\", \"ALN_1\")])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gaf2aln.py b/gaf2aln.py index 7657046..25fdebf 100644 --- a/gaf2aln.py +++ b/gaf2aln.py @@ -412,40 +412,30 @@ for aln_name in aln_dict.keys(): tmp_aln["CG"] += f"{nodes_length[node_id]}I" elif len(_) and _[-1]["Q.END"] == q_start: # Following on the query, not on the target (i.e. Deletion) tmp_aln["Q.END"] = q_end - tmps_aln["CG"] += f"{nodes_length[node_id]}D" + tmp_aln["CG"] += f"{nodes_length[node_id]}D" else : # Else, completely different + try : + _.append(tmp_aln) + except : pass tmp_aln = { "Q.START": q_start, "Q.END": q_end, "T.START": t_start, "T.END": t_end, "CG": _CG, - } - print("\t", tmp_aln) - + } + else : print("\tNot in path") # Node is not in the path - _.append(tmp_aln) - ALNS[(path_name, aln_name)] = _ - -## Debug -for elem in ALNS[("TO1000#1#chr03", "ALN_1")]: - print(elem) - + ALNS[(path_name, aln_name)] = _ - - - - - - - - - - - - - +## Debug +for elem in ALNS.keys(): + print(elem) + +for key, elem in ALNS.items(): + print(key) + print(elem) \ No newline at end of file -- GitLab From ba8d0c2bf543e8e82785974a8c4da89487304e75 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 13:36:27 +0200 Subject: [PATCH 16/30] Update Anchors2Path.py --- Anchors2Path.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index 3111c6f..e9582f8 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -121,12 +121,14 @@ node_path_count = {} ## {<NODE_ID>: <Number of path traversing this node>} # Computing number of path traversing each nodes for path_id, node_list in path_nodes.items(): - for 
node_id in node_list: + for node_id, counts in np.unique(node_list, return_counts=True): - try : - node_path_count[node_id] += 1 - except : - node_path_count[node_id] = 1 + # Filtering anchors based on appearance (we keep unique anchors) + if counts == 1 : + try : + node_path_count[node_id] += 1 + except : + node_path_count[node_id] = 1 # Searching anchors n_path = len(list(path_nodes.keys())) @@ -137,17 +139,23 @@ for node_id, count in node_path_count.items(): # Computing path position for each node of the path of interest current_pos = 0 +ordered_anchors = [] for node_id in path_nodes[args.pathname]: _end = current_pos + nodes_length[node_id] # Trying to add anchors path position if it is an anchor try : Anchors[int(node_id)].append( (current_pos, _end) ) + + # Keeping track of order of appearance of anchors + ordered_anchors.append(int(node_id)) except: pass current_pos = _end +# Filtering anchors based on + # Transforming data into a table ID, START, END = [], [], [] for node_id, positions in Anchors.items(): -- GitLab From 022035307bd73924f77b2e7b0995bafe03210e30 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 13:42:34 +0200 Subject: [PATCH 17/30] Update Anchors2Path.py --- Anchors2Path.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index e9582f8..8a8261d 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -121,10 +121,11 @@ node_path_count = {} ## {<NODE_ID>: <Number of path traversing this node>} # Computing number of path traversing each nodes for path_id, node_list in path_nodes.items(): - for node_id, counts in np.unique(node_list, return_counts=True): - + nodes_counts = np.unique(node_list, return_counts=True) + for i in range(len(nodes_counts)): + node_id, count = nodes_counts[0][i], nodes_counts[1][i] # Filtering anchors based on appearance (we keep unique anchors) - if counts == 1 : + if count == 1 : try : 
node_path_count[node_id] += 1 except : -- GitLab From 8a62837dad2792b87c61deb44cc1927f0dca03a9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 14:48:26 +0200 Subject: [PATCH 18/30] Update Anchors2Path.py --- Anchors2Path.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Anchors2Path.py b/Anchors2Path.py index 8a8261d..a3df05c 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -124,6 +124,7 @@ for path_id, node_list in path_nodes.items(): nodes_counts = np.unique(node_list, return_counts=True) for i in range(len(nodes_counts)): node_id, count = nodes_counts[0][i], nodes_counts[1][i] + print(node_id, count) # Filtering anchors based on appearance (we keep unique anchors) if count == 1 : try : -- GitLab From 1a0dd8fb93808ec443394ae2eaf4eaf50955e5a4 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 15:02:06 +0200 Subject: [PATCH 19/30] Update Anchors2Path.py --- Anchors2Path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index a3df05c..98565e4 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -121,11 +121,11 @@ node_path_count = {} ## {<NODE_ID>: <Number of path traversing this node>} # Computing number of path traversing each nodes for path_id, node_list in path_nodes.items(): + print(path_id) nodes_counts = np.unique(node_list, return_counts=True) for i in range(len(nodes_counts)): node_id, count = nodes_counts[0][i], nodes_counts[1][i] - print(node_id, count) - # Filtering anchors based on appearance (we keep unique anchors) + print("\t", node_id, count) if count == 1 : try : node_path_count[node_id] += 1 -- GitLab From 291de812648d640e807678e771f2198153cc64f0 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 15:05:03 +0200 Subject: [PATCH 20/30] Update Anchors2Path.py --- Anchors2Path.py | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index 98565e4..40c119f 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -17,7 +17,7 @@ from functools import reduce import concurrent.futures import gzip -version = "0.1" +version = "0.1.1" ## Argument parser arg_parser = argparse.ArgumentParser(description='Anchors2Path') @@ -72,6 +72,8 @@ if args.version: print(version) os._exit(0) +print("Version:\t", version) + # Timing the script start_time = time.time() @@ -121,9 +123,10 @@ node_path_count = {} ## {<NODE_ID>: <Number of path traversing this node>} # Computing number of path traversing each nodes for path_id, node_list in path_nodes.items(): - print(path_id) + nodes_counts = np.unique(node_list, return_counts=True) - for i in range(len(nodes_counts)): + print(path_id) + for i in range(len(nodes_counts[0])): node_id, count = nodes_counts[0][i], nodes_counts[1][i] print("\t", node_id, count) if count == 1 : -- GitLab From 02a3e63c343d0dc0123c49ba5fe5b46dcbbc0aa8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 3 Jun 2024 15:07:48 +0200 Subject: [PATCH 21/30] Update Anchors2Path.py --- Anchors2Path.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Anchors2Path.py b/Anchors2Path.py index 40c119f..eb325ec 100644 --- a/Anchors2Path.py +++ b/Anchors2Path.py @@ -17,7 +17,7 @@ from functools import reduce import concurrent.futures import gzip -version = "0.1.1" +version = "0.1.2" ## Argument parser arg_parser = argparse.ArgumentParser(description='Anchors2Path') @@ -125,10 +125,10 @@ node_path_count = {} for path_id, node_list in path_nodes.items(): nodes_counts = np.unique(node_list, return_counts=True) - print(path_id) + print(f"[Anchors2Path] Counting nodes in {path_id}") for i in range(len(nodes_counts[0])): node_id, count = nodes_counts[0][i], nodes_counts[1][i] - print("\t", node_id, count) + if count == 1 : try : node_path_count[node_id] += 1 -- GitLab From 
c4ff0c313ab30d67b4d66086ac6ccad16f99b2ed Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 17:11:32 +0200 Subject: [PATCH 22/30] Adding UniP --- GFAvc.py | 6 ++---- PanGeTools.def | 4 ++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/GFAvc.py b/GFAvc.py index 7ac2173..5cf601f 100644 --- a/GFAvc.py +++ b/GFAvc.py @@ -44,6 +44,7 @@ if args.version: with open(args.gfa, 'r') as file: gfa = file.readlines() + # for line in file : ## Changing version number in header assert gfa[0].split('\t')[1] == "VN:Z:1.1" @@ -59,10 +60,7 @@ for lineID in sorted(range(len(gfa)), reverse = True): if gfa[lineID][0] == "W" : - curLine = gfa.pop(lineID).split('\t') - - # Removing '\n' at the end of the line - curLine[-1] = curLine[-1][:-1] + curLine = gfa.pop(lineID).strip().split('\t') # Transforming '>..>..<..>..' to ['>..', '>..', '<..', '>..'] curWalk = re.findall(r'>\w+|<\w+', curLine[-1]) diff --git a/PanGeTools.def b/PanGeTools.def index 41e76ac..0e0a29c 100644 --- a/PanGeTools.def +++ b/PanGeTools.def @@ -209,6 +209,10 @@ Stage: build mv bin/GraphAligner /apps/bin/ cd /apps && rm -rf GraphAligner + # Installing UniP + cd /apps + git clone https://forgemia.inra.fr/alexis.mergez/unip.git UniP + # Cleaning step ## Removing tarballs rm /apps/*.tar* -- GitLab From 734e3396599ba5ab710bfc74ee5232899df1aac8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Tue, 9 Jul 2024 20:07:20 +0200 Subject: [PATCH 23/30] Linked Unip in container --- PanGeTools.def | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PanGeTools.def b/PanGeTools.def index 0e0a29c..1f850ac 100644 --- a/PanGeTools.def +++ b/PanGeTools.def @@ -212,11 +212,15 @@ Stage: build # Installing UniP cd /apps git clone https://forgemia.inra.fr/alexis.mergez/unip.git UniP + ln -s /apps/UniP/UniP.py /apps/bin/UniP.py # Cleaning step ## Removing tarballs rm /apps/*.tar* +%apprun unip + exec UniP.py "$@" + %apprun 
gfaffix exec gfaffix "$@" -- GitLab From 079dbc0ccf7aa9ec634d3dd288e055d2ca064c78 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Tue, 9 Jul 2024 20:07:49 +0200 Subject: [PATCH 24/30] Created Pancat unofficial container --- .gitlab-ci.yml | 30 +++++++++++++++++++++++++++++- Pancat.def | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 Pancat.def diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9068aa4..834c834 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,9 +39,19 @@ test_for_changes: else RUN_panache="0"; fi + + # Check block for Pancat.def + - curMd5=$(git show ${curTag}:Pancat.def | md5sum | cut -d' ' -f1) + - prevMd5=$(git show ${prevTag}:Pancat.def | md5sum | cut -d' ' -f1) + - if [ $curMd5 != $prevMd5 ]; then + RUN_Pancat="1"; + else + RUN_Pancat="0"; + fi - echo "RUN_PanGeTools=$RUN_PanGeTools" >> build.env - echo "RUN_panache=$RUN_panache" >> build.env + - echo "RUN_Pancat=$RUN_Pancat" >> build.env artifacts: reports: @@ -87,4 +97,22 @@ build:panache: apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" panache.sif oras://"$CI_REGISTRY_IMAGE"/panache:latest; fi - +build:Pancat: + stage: build + tags: + - stable # Using Stable runners as test runners don't work properly + needs: + - job: "test_for_changes" + artifacts: true + image: + name: kaczmarj/apptainer:latest + entrypoint: [""] + rules: + - if: $CI_COMMIT_TAG # Run this job when a tag is created + script: + - if [ $RUN_Pancat == "1" ]; then + sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" Pancat.def; + apptainer build Pancat.sif Pancat.def; + apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Pancat.sif oras://"$CI_REGISTRY_IMAGE"/pancat:"$CI_COMMIT_TAG"; + apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Pancat.sif 
oras://"$CI_REGISTRY_IMAGE"/pancat:latest; + fi diff --git a/Pancat.def b/Pancat.def new file mode 100644 index 0000000..67aac3d --- /dev/null +++ b/Pancat.def @@ -0,0 +1,36 @@ +Bootstrap: docker +From: ghcr.io/mamba-org/micromamba:latest +Stage: build + +%environment + export MAMBA_DOCKERFILE_ACTIVATE=1 + export PATH="$PATH:/apps/Pancat" + +%post + + export MAMBA_DOCKERFILE_ACTIVATE=1 + apt-get update && apt-get upgrade -y + apt-get install -y git + + # Creating base environment with micromamba + mkdir /apps + micromamba create -y -p /apps/base -c conda-forge -c bioconda \ + python=3.10.* + + git clone https://github.com/Tharos-ux/pancat.git /apps/Pancat + cd /apps/Pancat + micromamba run -p /apps/base pip install -r requirements.txt --upgrade + micromamba run -p /apps/base python -m pip install . --quiet + +%runscript + exec micromamba run -p /apps/base "$@" + +%labels + Author alexis.mergez@inrae.fr + Image.version VERSION_NUMBER + pancat.home https://github.com/Tharos-ux/pancat.git + about.home https://forgemia.inra.fr/alexis.mergez/pan1capps + +%help + Unofficial Apptainer container for Pancat (https://github.com/Tharos-ux/pancat.git). + Image version : VERSION_NUMBER -- GitLab From db6fa3107a42e3fb6204ca7aacb4541878876a22 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Tue, 9 Jul 2024 20:10:51 +0200 Subject: [PATCH 25/30] Tricking CICD --- Pancat.def | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Pancat.def b/Pancat.def index 67aac3d..f87205f 100644 --- a/Pancat.def +++ b/Pancat.def @@ -7,7 +7,6 @@ Stage: build export PATH="$PATH:/apps/Pancat" %post - export MAMBA_DOCKERFILE_ACTIVATE=1 apt-get update && apt-get upgrade -y apt-get install -y git @@ -33,4 +32,4 @@ Stage: build %help Unofficial Apptainer container for Pancat (https://github.com/Tharos-ux/pancat.git). 
- Image version : VERSION_NUMBER + Image version: VERSION_NUMBER -- GitLab From b20d7852034afeeb4cc03f76ba4f45361623fa05 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 26 Jul 2024 12:00:11 +0200 Subject: [PATCH 26/30] Update Pancat.def --- Pancat.def | 1 + 1 file changed, 1 insertion(+) diff --git a/Pancat.def b/Pancat.def index f87205f..56ba7e0 100644 --- a/Pancat.def +++ b/Pancat.def @@ -33,3 +33,4 @@ Stage: build %help Unofficial Apptainer container for Pancat (https://github.com/Tharos-ux/pancat.git). Image version: VERSION_NUMBER + -- GitLab From d8cfcfcefe79d85b48b6038eb56ca89b2295ae7a Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Thu, 1 Aug 2024 17:59:44 +0200 Subject: [PATCH 27/30] Update PanGeTools.def - Added Seqwish v0.7.9 - Updated VG to v1.58.0 --- PanGeTools.def | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/PanGeTools.def b/PanGeTools.def index 1f850ac..79465d8 100644 --- a/PanGeTools.def +++ b/PanGeTools.def @@ -96,7 +96,7 @@ Stage: build # Installing vg cd /apps/bin - wget --no-check-certificate -c https://github.com/vgteam/vg/releases/download/v1.56.0/vg + wget --no-check-certificate -c https://github.com/vgteam/vg/releases/download/v1.58.0/vg chmod +x vg # Installing panacus @@ -126,6 +126,14 @@ Stage: build cmake -H. -Bbuild && cmake --build build -- -j $(nproc) mv /apps/wfmash-v0.10.5/build/bin/wfmash /apps/bin/wfmash + # Installing seqwish + cd /apps + wget --no-check-certificate https://github.com/ekg/seqwish/releases/download/v0.7.9/seqwish-v0.7.9.tar.gz + tar -zxvf seqwish-v0.7.9.tar.gz + cd seqwish-v0.7.9 + cmake -H. 
-Bbuild && cmake --build build -- -j $(nproc) + mv /apps/seqwish-v0.7.9/bin/seqwish /apps/bin/seqwish + # Installing bgzip cd /apps wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.19.1/htslib-1.19.1.tar.bz2 @@ -257,6 +265,9 @@ Stage: build %apprun wfmash exec wfmash "$@" +%apprun seqwish + exec seqwish "$@" + %apprun minimap2 exec minimap2 "$@" @@ -320,13 +331,14 @@ Stage: build GFAffix.Version 0.1.5 smoothxg.Version 0.7.2 Odgi.Version 0.8.6 - vg.Version 1.56.0 + vg.Version 1.58.0 panacus.Version 0.2.3 gfatools.Version 0.5 GFAvc.Version 0.3 GFAstats.Version 0.3.2 Samtools.Version 1.19 wfmash.Version 0.10.5 + seqwish.Version 0.7.9 htslib.Version 1.19.1 minimap2.Version 2.26 vcfbub.Version 0.1.0 @@ -344,7 +356,7 @@ Stage: build - GFAffix v0.1.5 - smoothxg v0.7.2 - odgi v0.8.6 - - vg v1.56.0 + - vg v1.58.0 - panacus v0.2.3 - gfatools v0.5 - GFAvc v0.3 @@ -352,6 +364,7 @@ Stage: build - samtools v1.19 - bgzip v1.19.1 - wfmash v0.10.5 + - seqwish v0.7.9 - minimap2 v2.26 - vcfbub v0.1.0 - vcflib v1.0.9 -- GitLab From faf2580e937979f6e2baaaf1c4ba790bcb64196b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 2 Aug 2024 16:55:16 +0200 Subject: [PATCH 28/30] Bumped Wfmash version --- .gitignore | 3 +- .gitlab-ci.yml | 119 ++++++++++++++----------------------------------- PanGeTools.def | 12 ++--- 3 files changed, 42 insertions(+), 92 deletions(-) diff --git a/.gitignore b/.gitignore index 1fcf15d..75c3153 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.sif \ No newline at end of file +*.sif +PGT-beta.def diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 834c834..40251c1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,118 +1,67 @@ -release_job: - stage: .pre - image: registry.gitlab.com/gitlab-org/release-cli:latest - rules: - - if: $CI_COMMIT_TAG # Run this job when a tag is created - script: - - echo "running release_job" - release: # See https://docs.gitlab.com/ee/ci/yaml/#release for 
available properties - tag_name: '$CI_COMMIT_TAG' - description: '$CI_COMMIT_TAG' - -test_for_changes: - stage: .pre +Build_PanGeTools: + stage: build tags: - stable # Using Stable runners as test runners don't work properly - image: ubuntu:latest + image: + name: kaczmarj/apptainer:latest + entrypoint: [""] rules: - - if: $CI_COMMIT_TAG + - if: $CI_COMMIT_TAG # Run this job when a tag is created + when: manual script: - - apt update -y && apt install -y git - # Getting current and previous tags - - curTag=$(git describe --abbrev=0 --tags) - - prevTag=$(git describe --abbrev=0 --tags HEAD^) - - # Check block for PanGeTools.def - - curMd5=$(git show ${curTag}:PanGeTools.def | md5sum | cut -d' ' -f1) - - prevMd5=$(git show ${prevTag}:PanGeTools.def | md5sum | cut -d' ' -f1) - - if [ $curMd5 != $prevMd5 ]; then - RUN_PanGeTools="1"; - else - RUN_PanGeTools="0"; - fi - - # Check block for panache.def - - curMd5=$(git show ${curTag}:Panache.def | md5sum | cut -d' ' -f1) - - prevMd5=$(git show ${prevTag}:Panache.def | md5sum | cut -d' ' -f1) - - if [ $curMd5 != $prevMd5 ]; then - RUN_panache="1"; - else - RUN_panache="0"; - fi - - # Check block for Pancat.def - - curMd5=$(git show ${curTag}:Pancat.def | md5sum | cut -d' ' -f1) - - prevMd5=$(git show ${prevTag}:Pancat.def | md5sum | cut -d' ' -f1) - - if [ $curMd5 != $prevMd5 ]; then - RUN_Pancat="1"; - else - RUN_Pancat="0"; - fi - - - echo "RUN_PanGeTools=$RUN_PanGeTools" >> build.env - - echo "RUN_panache=$RUN_panache" >> build.env - - echo "RUN_Pancat=$RUN_Pancat" >> build.env - + - sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" PanGeTools.def + - apptainer build PanGeTools.sif PanGeTools.def + - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" PanGeTools.sif oras://"$CI_REGISTRY_IMAGE"/pangetools:"$CI_COMMIT_TAG" artifacts: - reports: - dotenv: build.env + paths: + - PanGeTools.sif + expire_in: 1 week -build:PanGeTools: - stage: build +Latest_PanGeTools: + stage: 
deploy tags: - stable # Using Stable runners as test runners don't work properly needs: - - job: "test_for_changes" - artifacts: true + - job: "Build_PanGeTools" image: name: kaczmarj/apptainer:latest entrypoint: [""] rules: - - if: $CI_COMMIT_TAG # Run this job when a tag is created + - if: $CI_COMMIT_TAG # Run this job when a tag is created + when: manual script: - - if [ $RUN_PanGeTools == "1" ]; then - sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" PanGeTools.def; - apptainer build PanGeTools.sif PanGeTools.def; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" PanGeTools.sif oras://"$CI_REGISTRY_IMAGE"/pangetools:"$CI_COMMIT_TAG"; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" PanGeTools.sif oras://"$CI_REGISTRY_IMAGE"/pangetools:latest; - fi + - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" PanGeTools.sif oras://"$CI_REGISTRY_IMAGE"/pangetools:latest -build:panache: +Build_Pancat: stage: build tags: - stable # Using Stable runners as test runners don't work properly - needs: - - job: "test_for_changes" - artifacts: true image: name: kaczmarj/apptainer:latest entrypoint: [""] rules: - if: $CI_COMMIT_TAG # Run this job when a tag is created + when: manual script: - - if [ $RUN_panache == "1" ]; then - sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" Panache.def; - apptainer build Panache.sif Panache.def; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" panache.sif oras://"$CI_REGISTRY_IMAGE"/panache:"$CI_COMMIT_TAG"; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" panache.sif oras://"$CI_REGISTRY_IMAGE"/panache:latest; - fi + - sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" Pancat.def + - apptainer build Pancat.sif Pancat.def + - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password 
"${CI_REGISTRY_PASSWORD}" Pancat.sif oras://"$CI_REGISTRY_IMAGE"/pancat:"$CI_COMMIT_TAG" + artifacts: + paths: + - Pancat.sif + expire_in: 1 week -build:Pancat: - stage: build +Latest_Pancat: + stage: deploy tags: - stable # Using Stable runners as test runners don't work properly needs: - - job: "test_for_changes" - artifacts: true + - job: "Build_Pancat" image: name: kaczmarj/apptainer:latest entrypoint: [""] rules: - - if: $CI_COMMIT_TAG # Run this job when a tag is created + - if: $CI_COMMIT_TAG # Run this job when a tag is created + when: manual script: - - if [ $RUN_Pancat == "1" ]; then - sed -i "s/VERSION_NUMBER/${CI_COMMIT_TAG#v}/" Pancat.def; - apptainer build Pancat.sif Pancat.def; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Pancat.sif oras://"$CI_REGISTRY_IMAGE"/pancat:"$CI_COMMIT_TAG"; - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Pancat.sif oras://"$CI_REGISTRY_IMAGE"/pancat:latest; - fi + - apptainer push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Pancat.sif oras://"$CI_REGISTRY_IMAGE"/pancat:latest \ No newline at end of file diff --git a/PanGeTools.def b/PanGeTools.def index 79465d8..ecc05f1 100644 --- a/PanGeTools.def +++ b/PanGeTools.def @@ -120,11 +120,11 @@ Stage: build # Installing wfmash cd /apps - wget --no-check-certificate https://github.com/waveygang/wfmash/releases/download/v0.10.5/wfmash-v0.10.5.tar.gz - tar -zxvf wfmash-v0.10.5.tar.gz - cd wfmash-v0.10.5 + wget --no-check-certificate https://github.com/waveygang/wfmash/releases/download/v0.17.0/wfmash-v0.17.0.tar.gz + tar -zxvf wfmash-v0.17.0.tar.gz + cd wfmash-v0.17.0 cmake -H. 
-Bbuild && cmake --build build -- -j $(nproc) + mv /apps/wfmash-v0.17.0/build/bin/wfmash /apps/bin/wfmash # Installing seqwish cd /apps @@ -337,7 +337,7 @@ Stage: build GFAvc.Version 0.3 GFAstats.Version 0.3.2 Samtools.Version 1.19 - wfmash.Version 0.10.5 + wfmash.Version 0.17.0 seqwish.Version 0.7.9 htslib.Version 1.19.1 minimap2.Version 2.26 @@ -363,7 +363,7 @@ Stage: build - GFAstats v0.3.2 - samtools v1.19 - bgzip v1.19.1 - - wfmash v0.10.5 + - wfmash v0.17.0 - seqwish v0.7.9 - minimap2 v2.26 - vcfbub v0.1.0 -- GitLab From c77499c2e10b85b05cfebf6fc7b0556bbb7b70f1 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 5 Aug 2024 17:54:08 +0200 Subject: [PATCH 29/30] GFAvc v0.4 Changelog : - Added support for gzipped GFA - Stripping escape character when parsing - Added GFA1.0 to GFA1.1 conversion direction (intended for PGGB GFA) - Optional index can be passed to add START-END range to walks when converting from GFA1.0 to GFA1.1 Note on GFA1.0 to GFA1.1 conversion : As PGGB has no reference, every sample is set as reference (i.e. added to the space separated list from the GFA header under RS:Z:...). This simplifies the use of VG tools such as vg surject which only surjects onto reference walks. This also prevents paths/walks from having different names between vg and non-vg file format (no phase_block) --- GFAvc.py | 187 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 150 insertions(+), 37 deletions(-) diff --git a/GFAvc.py b/GFAvc.py index 5cf601f..f130f10 100644 --- a/GFAvc.py +++ b/GFAvc.py @@ -2,33 +2,45 @@ # -*- coding: utf-8 -*- """ GFAvc: GFA version converter. -Convert GFA from v1.1 to v1.0 (Convert walk to paths) +Convert GFA from v1.1 to v1.0 (Convert walk to paths) and vice versa.
@author: alexis.mergez@inrae.fr -@version: 0.3 +@version: 0.4 """ import re import argparse import os +import gzip -version = "0.3" +version = "0.4" ## Argument parser arg_parser = argparse.ArgumentParser(description='GFAvc: GFA version converter') arg_parser.add_argument( - "--gfa", + "--gfa1", "-g", - dest = "gfa", - required = True, - help = "GFA 1.1 file." - ) + dest = "GFA1", + help = "GFA 1.1 file. (Gzip or not)" + ) +arg_parser.add_argument( + "--gfa", + "-G", + dest = "GFA", + help = "GFA 1.0 file. (Gzip or not)" + ) arg_parser.add_argument( "--outName", "-o", dest = "outName", required = True, help = "Output file name." - ) + ) +arg_parser.add_argument( + "--index", + "-i", + dest = "index", + help = "TSV containing start stop for each path in order to convert to walk (optional)" + ) arg_parser.add_argument( '--version', '-v', @@ -38,45 +50,146 @@ arg_parser.add_argument( ) args = arg_parser.parse_args() +#% Returning version if args.version: print(version) - os._exit(0) + os._exit(0) + +#% Parsing index +if args.index is not None: + with open(args.index, 'r') as handle: + file = [line.rstrip() for line in handle.readlines()] + + index = {} + for line in file: + split = line.split("\t") + index[split[0]] = [split[1], split[2]] + +else : index = None + +#% Conversion functions +def gfa11_to_gfa10(gfa1_file = args.GFA1): + #% Reading GFA + # If not gzipped : + if gfa1_file[-2:] != "gz" : + with open(gfa1_file, 'r') as file: + gfa = [line.rstrip() for line in file.readlines()] + + # If gzipped : + else : + with gzip.open(gfa1_file, 'r') as file: + gfa = [line.decode().rstrip() for line in file.readlines()] + + #% Changing version number in header + assert gfa[0].split('\t')[1] == "VN:Z:1.1" + _ = gfa[0].split('\t') + _[1] = "VN:Z:1.0" + gfa[0] = "\t".join(_) + + #% Iterating in reverse to put paths at the end. 
+ for lineID in sorted(range(len(gfa)), reverse = True): + if gfa[lineID][0] == "S" : + _ = '\t'.join(gfa[lineID].split('\t')[:3]) + gfa[lineID] = f"{_}" + + if gfa[lineID][0] == "W" : + + curLine = gfa.pop(lineID).split('\t') + + # Transforming '>..>..<..>..' to ['>..', '>..', '<..', '>..'] + curWalk = re.findall(r'>\w+|<\w+', curLine[-1]) + + # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+' + path = [f'{elem[1:]}{(elem[0] == ">")*"+"+(elem[0] == "<")*"-"}' for elem in curWalk] + + newLine = ['P', f"{'#'.join(curLine[1:5])}-{curLine[5]}", ','.join(path), '*'] + + gfa.append('\t'.join(newLine)) + + #% Moving path lines to the end + if gfa[lineID][0] == "P" : + + curLine = gfa.pop(lineID) + gfa.append(curLine) + + return gfa + +def gfa10_to_gfa11(gfa_file = args.GFA, index = index): + + #% Reading GFA + # If not gzipped : + if gfa_file[-2:] != "gz" : + with open(gfa_file, 'r') as file: + gfa = [line.rstrip() for line in file.readlines()] + + # If gzipped : + else : + with gzip.open(gfa_file, 'r') as file: + gfa = [line.decode().rstrip() for line in file.readlines()] + + sign = {"+": ">", "-": "<"} + samples = [] + + #% Iterating in reverse to put walks at the end. 
+ for lineID in sorted(range(len(gfa)), reverse = True): + + if gfa[lineID][0] == "P" : + + curLine = gfa.pop(lineID).split('\t') -with open(args.gfa, 'r') as file: - gfa = file.readlines() - # for line in file : + # Converting '..+,..+,..-,..+' to ['>..', '>..', '<..', '>..'] + walk = [ + f"{sign[elem[-1]]}{elem[:-1]}" + for elem in curLine[-2].split(',') + ] -## Changing version number in header -assert gfa[0].split('\t')[1] == "VN:Z:1.1" -_ = gfa[0].split('\t') -_[1] = "VN:Z:1.0" -gfa[0] = "\t".join(_) + splittedID = curLine[1].split(":") + if len(splittedID) == 2: + # Range info is available + ID = splittedID[0].split("#") + RANGE = splittedID[1].split("-") + elif len(splittedID) == 1 and index is not None: + ID = splittedID[0].split("#") + RANGE = index[splittedID[0]] + else : + ID = splittedID[0].split("#") + RANGE = 2*["*"] + + newLine = ['W'] + ID + RANGE + [f"{''.join(walk)}"] + samples.append(ID[0]) -## Iterating in reverse to put paths at the end. -for lineID in sorted(range(len(gfa)), reverse = True): - if gfa[lineID][0] == "S" : - _ = '\t'.join(gfa[lineID].split('\t')[:3]) - gfa[lineID] = f"{_}\n" + gfa.append('\t'.join(newLine)) - if gfa[lineID][0] == "W" : + #% Moving walk lines to the end + if gfa[lineID][0] == "W" : - curLine = gfa.pop(lineID).strip().split('\t') - - # Transforming '>..>..<..>..' 
to ['>..', '>..', '<..', '>..'] - curWalk = re.findall(r'>\w+|<\w+', curLine[-1]) + curLine = gfa.pop(lineID) + gfa.append(curLine) - # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+' - path = [f'{elem[1:]}{(elem[0] == ">")*"+"+(elem[0] == "<")*"-"}' for elem in curWalk] + samples = list(set(samples)) + #% Changing version number in header + assert gfa[0].split('\t')[1] == "VN:Z:1.0" + _ = gfa[0].split('\t') + _[1] = "VN:Z:1.1" + _.append(f"RS:Z:{' '.join(samples)}") + gfa[0] = "\t".join(_) - newLine = ['P', f"{'#'.join(curLine[1:5])}-{curLine[5]}", ','.join(path), '*\n'] + return gfa - gfa.append('\t'.join(newLine)) +#% Selecting the conversion direction +if args.GFA is None and args.GFA1 is not None: + print("[GFAvc] Converting from GFA 1.1 to GFA 1.0 ...") + gfa = gfa11_to_gfa10() -## Moving path lines to the end - if gfa[lineID][0] == "P" : +elif args.GFA is not None and args.GFA1 is None and args.refname is not None: + print("[GFAvc] Converting from GFA 1.0 to GFA 1.1 ...") + gfa = gfa10_to_gfa11() - curLine = gfa.pop(lineID) - gfa.append(curLine) +else: + print("[GFAvc] Unable to convert !") + os._exit(1) +#% Exporting +gfa[-1] = f"{gfa[-1]}\n" with open(args.outName, "w") as file: - file.write("".join(gfa)) \ No newline at end of file + file.write("\n".join(gfa)) \ No newline at end of file -- GitLab From 76a571bbe0328bb858451b95e61d7d88c707a1d8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 5 Aug 2024 17:54:21 +0200 Subject: [PATCH 30/30] Removing/moving old scripts --- DotPlot_BED_corrector.py | 129 -- gaf2aln.ipynb | 2443 -------------------------------------- gaf2aln.py | 441 ------- 3 files changed, 3013 deletions(-) delete mode 100644 DotPlot_BED_corrector.py delete mode 100644 gaf2aln.ipynb delete mode 100644 gaf2aln.py diff --git a/DotPlot_BED_corrector.py b/DotPlot_BED_corrector.py deleted file mode 100644 index f4858af..0000000 --- a/DotPlot_BED_corrector.py +++ /dev/null @@ -1,129 +0,0 
@@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -DotPlot Bed like file corrector. -Correct splitted paths from odgi untangle bed like file used for creating dotplots. -See Odgi documentation for the dot plot tutorial - -@author: alexis.mergez@inrae.fr -@version: 0.1 -""" -import re -import argparse -import os -import numpy as np -import time -import pandas as pd -from functools import reduce -import concurrent.futures -import gzip - -version = "0.1" - -## Argument parser -arg_parser = argparse.ArgumentParser(description='GFAstats: GFA statistics') -arg_parser.add_argument( - "--input", - "-i", - dest = "input", - required = True, - help = "Bed like file" - ) -arg_parser.add_argument( - "--output", - "-o", - dest = "output", - required = True, - help = "Output name" - ) -arg_parser.add_argument( - '--version', - '-v', - action="store_true", - dest = "version", - help = "Show version" -) -arg_parser.add_argument( - '--progress', - '-P', - action="store_true", - dest = "progress", - help = "Show progress to stdout" -) -args = arg_parser.parse_args() - -# Printing version and exiting if required -if args.version: - print(version) - os._exit(0) - -# importing bed file with pandas -if args.progress : print(f"[Bed_corrector::Parsing] Reading {args.input} ...") -bed = pd.read_csv( - args.input, - sep = '\t' -) - -# Getting the queries name and checking if we get multiples for one path -if args.progress : print(f"[Bed_corrector::Identify] Searching for splitted paths ...") -## Getting unique query names -queries = bed["query.name"].unique() - -## Extracting path name and ranges from unique queries names -paths = [query.split(":")[0] for query in queries] -ranges = np.array([query.split(":")[1].split("-") for query in queries]) - -## Creating temporary dataframe to store previous info -temp_df = pd.DataFrame({ - "queries" : queries, - "path" : paths, - "start" : ranges[:, 0], - "end" : ranges[:, 1] -}) - -## Getting the path that are splitted (i.e. 
more than one unique occurence) -splitted_paths = [] -_ = np.unique(paths, return_counts = True) -for path, count in zip(_[0], _[1]): - if count > 1 : # More than a repeat - splitted_paths.append(path) - if args.progress : print(f"[Bed_corrector::Identify] {path} is splitted") - -# For each splitted path identified, we search the minimum start and the maximum end -if args.progress : print(f"[Bed_corrector::Identify] Searching for min start and max end of splitted paths ...") -min_start = {} -max_end = {} -for path_name in splitted_paths: - min_start[path_name] = int(temp_df[temp_df.path == path_name].start.min()) - max_end[path_name] = int(temp_df[temp_df.path == path_name].end.max()) - -# Traversing bed dataframe and offsetting paths based on the min_start of the main path -if args.progress : print(f"[Bed_corrector::Patching] Correcting splitted paths ...") -## Getting a copy of columns to modify -path_names, starts, ends = bed["query.name"].tolist(), bed["query.start"].tolist(), bed["query.end"].tolist() - -## Iterating over lines -for i in range(len(path_names)): - ## Getting the name and the range of the current line query - path, ranges = path_names[i].split(":") - - if path in splitted_paths: - #print("Before :", path_names[i], starts[i], ends[i]) - - ## Computing offset based on min start for this path - offset = (int(ranges.split('-')[0])-min_start[path]) - #print("Offset:", offset) - - ## Patching the info with new range (min start, max end) and shifted coordinates - path_names[i] = f"{path}:{min_start[path]}-{max_end[path]}" - starts[i] = int(starts[i])+offset - ends[i] = int(ends[i])+offset - #print("After :", path_names[i], starts[i], ends[i]) - -## Patching the bed -bed["query.name"] = path_names -bed["query.start"] = starts -bed["query.end"] = ends - -# Exporting -bed.to_csv(args.output, sep="\t", index = False) \ No newline at end of file diff --git a/gaf2aln.ipynb b/gaf2aln.ipynb deleted file mode 100644 index 04fe866..0000000 --- a/gaf2aln.ipynb 
+++ /dev/null @@ -1,2443 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "4ffaf9f6-cc1e-4190-9351-5431c930d25b", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import argparse\n", - "import concurrent.futures\n", - "import os\n", - "import re\n", - "\n", - "# Replace for argparse arguments\n", - "class arguments():\n", - " gfa = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/35Bra-v2a.chr03.gfa\"\n", - " gaf = \"/home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf\"\n", - " threads = 8\n", - " version = False\n", - "args = arguments()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "280c8847-22e8-4063-bde8-3e4e72cf20e7", - "metadata": {}, - "outputs": [], - "source": [ - "# Toolbox\n", - "def walk2path(walk):\n", - " \"\"\"\n", - " Takes a walk in a single string and returns a list of nodes id with signs (gfa v1 like)\n", - " \"\"\"\n", - " _ = re.findall(r'>\\w+|<\\w+', walk)\n", - " # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+'\n", - " return [f'{elem[1:]}{(elem[0] == \">\")*\"+\"+(elem[0] == \"<\")*\"-\"}' for elem in _]\n", - "\n", - "def cigar2basealn(cigar):\n", - " \"\"\"\n", - " Takes a CIGAR string and convert it into a list of base level alignment.\n", - " For example : \"345=\" -> [\"=\", \"=\", ..., \"=\"] of length 345.\n", - " \"\"\"\n", - " _ = re.findall(r'\\d+\\D', cigar)\n", - " final_cigar = []\n", - " for match in _:\n", - " final_cigar += [match[-1]]*int(match[:-1])\n", - "\n", - " return final_cigar\n", - "\n", - "def basealn2cigar(base_aln_list):\n", - " \n", - " last_elem = base_aln_list[0]\n", - " CIGAR = [[1, last_elem]]\n", - " for elem in base_aln_list[1:]:\n", - " if elem == last_elem:\n", - " CIGAR[-1][0] += 1\n", - "\n", - " else :\n", - " CIGAR[-1][0] = str(CIGAR[-1][0])\n", - " CIGAR.append([1, elem])\n", - " last_elem = elem\n", - " CIGAR[-1][0] = str(CIGAR[-1][0])\n", - " 
return \"\".join([\"\".join(block) for block in CIGAR if block[1] != \"\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "be12e9d4-de76-4c8b-af84-6567549483f4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[gaf2aln::GAF Parser] Reading /home/amergez/Documents/Scratch/LeChou/35Bra-v2a/Mapping2Graph/GA.FLC2.aln.gaf ...\n", - "[gaf2aln::GAF Parser] Extracting alignments ...\n", - "{'ALN_1': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7046526', '+'), ('7046528', '+'), ('7046530', '+'), ('7046531', '+'), ('7046532', '+'), ('7046533', '+'), ('7046534', '+'), ('7046536', '+'), ('7046537', '+'), ('7046539', '+'), ('7046541', '+'), ('7046542', '+'), ('7046544', '+'), ('7046546', '+'), ('7046547', '+'), ('7046549', '+'), ('7046551', '+'), ('7046552', '+'), ('7046554', '+'), ('7046556', '+'), ('7046556', '+'), ('7046556', '+'), ('7046557', '+'), ('7046558', '+'), ('7046559', '+'), ('7046560', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046561', '+'), ('7046562', '+'), ('7046564', '+'), ('7046565', '+'), ('7046567', '+'), ('7046568', '+'), ('7046570', '+'), ('7046571', '+'), ('7046573', '+'), ('7046574', '+'), ('7046576', '+'), ('7046577', '+'), ('7046579', '+'), ('7046581', '+'), ('7046583', '+'), ('7046584', '+'), ('7046586', '+'), ('7046587', '+'), ('7046589', '+'), ('7046590', '+'), ('7046592', '+'), ('7046593', '+'), ('7046594', '+'), ('7046596', '+'), ('7046597', '+'), ('7046599', '+'), ('7046600', '+'), ('7046601', '+'), ('7046603', '+'), ('7046604', '+'), ('7046606', '+'), ('7046608', '+'), ('7046609', '+'), ('7046621', '+'), ('7046622', '+'), ('7046624', '+'), ('7046625', '+'), ('7046626', '+'), ('7046628', '+'), ('7046631', '+'), ('7046673', '+'), ('7046631', '+'), ('7046673', '+'), 
('7046631', '+'), ('7046632', '+'), ('7046634', '+'), ('7046635', '+'), ('7046637', '+'), ('7046638', '+'), ('7046639', '+'), ('7046641', '+'), ('7046644', '+'), ('7046646', '+'), ('7046647', '+'), ('7046649', '+'), ('7046650', '+'), ('7046652', '+'), ('7046653', '+'), ('7046654', '+'), ('7046656', '+'), ('7046657', '+'), ('7046659', '+'), ('7046660', '+'), ('7046662', '+'), ('7046663', '+'), ('7046665', '+'), ('7046667', '+'), ('7046668', '+'), ('7046670', '+'), ('7046671', '+'), ('7046674', '+'), ('7046675', '+'), ('7046674', '+'), ('7046675', '+'), ('7046676', '+'), ('7046678', '+'), ('7046679', '+'), ('7046680', '+'), ('7046682', '+'), ('7046684', '+'), ('7046685', '+'), ('7046686', '+'), ('7046688', '+'), ('7046690', '+'), ('7046692', '+'), ('7046693', '+'), ('7046695', '+'), ('7046696', '+'), ('7046698', '+'), ('7046700', '+'), ('7046702', '+'), ('7046703', '+'), ('7046704', '+'), ('7046706', '+'), ('7046707', '+'), ('7046709', '+'), ('7046710', '+'), ('7046712', '+'), ('7046713', '+'), ('7046715', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046717', '+'), ('7046718', '+'), ('7046720', '+'), ('7046722', '+'), ('7046724', '+'), ('7046725', '+'), ('7046727', '+'), ('7046728', '+'), ('7046729', '+'), ('7046730', '+'), ('7046731', '+'), ('7046733', '+'), ('7046735', '+'), ('7046736', '+'), ('7046738', '+'), ('7046739', '+'), ('7046740', '+'), ('7046738', '+'), ('7046739', '+'), ('7046740', '+'), ('7046738', '+'), ('7046739', '+'), ('7046741', '+')], 'PATH.LEN': '3822', 'ALN.START': '77', 'ALN.END': '3812', 'RES.MATCH': '3735', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '60', 'RAW.CIGAR': 'cg:Z:3735=', 'TAGS': 'AS:f:3735,dv:f:0,id:f:1'}, 'ALN_2': {'QRY.NAME': 'FLC2.TO1000#1#chr03', 'QRY.LEN': '3735', 'QRY.START': '0', 'QRY.END': '3735', 'STRAND': '+', 'PATH.MATCH': [('7594382', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), 
('7594626', '+'), ('7594011', '+'), ('7594374', '+'), ('7594375', '+'), ('7594369', '+'), ('7594371', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594241', '+'), ('7594248', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594311', '+'), ('7594330', '+'), ('7594311', '+'), ('7594315', '+'), ('7594374', '+'), ('7594311', '+'), ('7594374', '+'), ('7594369', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594026', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594021', '+'), ('7594286', '+'), ('7594374', '+'), ('7594021', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594286', '+'), ('7594311', '+'), ('7594315', '+'), ('7594286', '+'), ('7594311', '+'), ('7594374', '+'), ('7594021', '+'), ('7594286', '+'), ('7594286', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594356', '+'), ('7594374', '+'), ('7594375', '+'), ('7594374', '+'), ('7594350', '+'), ('7594264', '+'), ('7594207', '+'), ('7594225', '+'), ('7594227', '+'), ('7594120', '+'), ('7594132', '+'), ('7594165', '+'), ('7594172', '+')], 'PATH.LEN': '61224', 'ALN.START': '0', 'ALN.END': '3735', 'RES.MATCH': '3734', 'ALN.BLOCK.LEN': '3735', 'MAPPING.QUAL': '0', 'RAW.CIGAR': 'cg:Z:57=1X3677=', 'TAGS': 'AS:f:3732.06,dv:f:0.000267738,id:f:0.999732'}}\n" - ] - } - ], - "source": [ - "# Parsing the .gaf file\n", - "print(f\"[gaf2aln::GAF Parser] Reading {args.gaf} ...\")\n", - "with open(args.gaf, 'r') as file:\n", - " gaf_lines = file.readlines()\n", - "\n", - "gaf_col = [\n", - " \"QRY.NAME\", \"QRY.LEN\", \"QRY.START\", \"QRY.END\", \"STRAND\", \n", - " \"PATH.MATCH\", \"PATH.LEN\", \"ALN.START\", \"ALN.END\",\n", - " \"RES.MATCH\", \"ALN.BLOCK.LEN\", \"MAPPING.QUAL\"\n", - " ]\n", - "\n", - "# Creating dictionnary to store alignments\n", - "print(f\"[gaf2aln::GAF Parser] Extracting 
alignments ...\")\n", - "aln_dict = {}\n", - "for line in range(len(gaf_lines)):\n", - " ## Splitting the line by tabulation\n", - " line_content = gaf_lines[line][:-1].split('\\t')\n", - "\n", - " ## Adding alignement info to dictionnary\n", - " aln_dict[f\"ALN_{line+1}\"] = {\n", - " gaf_col[i]: line_content[i] for i in range(len(gaf_col))\n", - " }\n", - " \n", - " ## Splitting \"PATH.MATCH\" into a list\n", - " aln_dict[f\"ALN_{line+1}\"][\"PATH.MATCH\"] = [\n", - " (str(node_id[:-1]), node_id[-1]) \n", - " for node_id in walk2path(aln_dict[f\"ALN_{line+1}\"][\"PATH.MATCH\"])\n", - " ]\n", - "\n", - " ## Adding CIGAR\n", - " aln_dict[f\"ALN_{line+1}\"][\"RAW.CIGAR\"] = line_content[-1]\n", - "\n", - " ## Adding tags\n", - " aln_dict[f\"ALN_{line+1}\"][\"TAGS\"] = \",\".join(line_content[13:-1])\n", - "\n", - "# Getting nodes of interest ids\n", - "aln_nodes = np.unique([\n", - " str(node_id) \n", - " for aln in aln_dict.keys() \n", - " for node_id, orient in aln_dict[aln][\"PATH.MATCH\"]\n", - "]).tolist()\n", - "\n", - "print(aln_dict)\n", - "del gaf_lines, gaf_col" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2f891424-0d88-4fd3-99ff-b0a8c90587ff", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[gaf2aln::GFA Parser] Reading /home/amergez/Documents/Scratch/LeChou/35Bra-v2a/35Bra-v2a.chr03.gfa ...\n", - "[gaf2aln::GFA Parser] Extracting nodes, paths and links ...\n" - ] - } - ], - "source": [ - "# Parsing the .gfa\n", - "print(f\"[gaf2aln::GFA Parser] Reading {args.gfa} ...\")\n", - "with open(args.gfa, 'r') as file:\n", - " gfa_lines = file.readlines()\n", - "\n", - "# Nodes length dictionnary structured as follow :\n", - "# {<NODE.ID>: <NODE.LENGTH>}\n", - "nodes_length = {}\n", - "# Nodes dictionnary structured as follow :\n", - "# { <ALN.NODE.ID> : {\n", - "# <PATH.NAME>: {\"START\": start, \"END\": end, \"STRAND\": strand), \n", - "# <ALN.NAME>: {\"START\": start, \"END\": end, 
\"S.OFF\": start.offset, \"E.OFF\": end.offset, \"STRAND\": strand, \"CIGAR\": CIGAR}\n", - "# }\n", - "# }\n", - "nodes = {node_id: {} for node_id in aln_nodes}\n", - "# Paths dictionnary structured as follow :\n", - "# {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>}\n", - "paths = {}\n", - "# Links dictionnary structured as follow : \n", - "# {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}}\n", - "links = {}\n", - "\n", - "# Parsing the gfa\n", - "print(f\"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...\")\n", - "\n", - "def GFA_parser(gfa_lines, nodes = nodes):\n", - " _links, _nodes, _nodes_length, paths = {}, {}, {}, {}\n", - " for line in gfa_lines:\n", - " line_content = line[:-1].split(\"\\t\")\n", - " line_id = line_content[0]\n", - " \n", - " # Segment line\n", - " if line_id == \"S\" :\n", - " \n", - " _nodes_length[str(line_content[1])] = len(line_content[2])\n", - " \n", - " # Link line\n", - " elif line_id == \"L\":\n", - " try :\n", - " _links[str(line_content[1])][str(line_content[3])] = {\n", - " \"FROM\": str(line_content[2]), \n", - " \"TO\": str(line_content[4])\n", - " }\n", - "\n", - " except :\n", - " _links[str(line_content[1])] = {\n", - " str(line_content[3]) : {\"FROM.ORIENT\": str(line_content[2]), \"TO.ORIENT\": str(line_content[4])}\n", - " }\n", - "\n", - " # Path line\n", - " elif line_id == \"P\":\n", - " _paths[str(line_content[1])] = {\n", - " \"NODES\": {\n", - " str(node_id[:-1]): str(node_id[-1])\n", - " for node_id in line_content[2].split(',')\n", - " },\n", - " \"CIGAR\": line_content[3]\n", - " }\n", - "\n", - " return [_links, _nodes, _nodes_length, _paths]\n", - "\n", - "# splits = np.quantile(range(len(gfa_lines)+1), q= np.array(args.threads+1)/args.threads, method='higher').tolist()\n", - "# res = []\n", - "# for i in range(1, len(splits)):\n", - "# res.append(executor.submit(GFA_parser, gfa_lines[splits[i-1]:splits[i]]))\n", - 
"\n", - "# for out in res:\n", - "# results = out.result()\n", - "\n", - "# for link_id, link_info in results[0].items():\n", - "# links[]\n", - "\n", - "\n", - "for line in gfa_lines:\n", - " line_content = line[:-1].split(\"\\t\")\n", - " line_id = line_content[0]\n", - " \n", - " # Segment line\n", - " if line_id == \"S\" :\n", - " \n", - " nodes_length[str(line_content[1])] = len(line_content[2])\n", - " \n", - " # Link line\n", - " elif line_id == \"L\":\n", - " try :\n", - " links[str(line_content[1])][str(line_content[3])] = {\n", - " \"FROM\": str(line_content[2]), \n", - " \"TO\": str(line_content[4])\n", - " }\n", - "\n", - " except :\n", - " links[str(line_content[1])] = {\n", - " str(line_content[3]) : {\"FROM.ORIENT\": str(line_content[2]), \"TO.ORIENT\": str(line_content[4])}\n", - " }\n", - "\n", - " # Path line\n", - " elif line_id == \"P\":\n", - " paths[str(line_content[1])] = {\n", - " \"NODES\": {\n", - " str(node_id[:-1]): str(node_id[-1])\n", - " for node_id in line_content[2].split(',')\n", - " },\n", - " \"CIGAR\": line_content[3]\n", - " }\n", - "\n", - "del gfa_lines" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "a403c88e-54ea-4a67-9047-dc44eba7f51a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[gaf2aln::Graph position processing] Computing nodes positions in each paths...\n", - "[gaf2aln::Graph position processing] Running on Capitata#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on D101#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on D134#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on G06-09-28#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on G07-DH-33#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on HDEM#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on Korso#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on M249#1#chr03 ...\n", - 
"[gaf2aln::Graph position processing] Running on OX-heart#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on PL021#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on RC34#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T02#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T03#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T04#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T06#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T07#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T08#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T09#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T10#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T11#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T12#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T13#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T14#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T15#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T16#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T17#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T18#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T19#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T21#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T24#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T25#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T26#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on T27#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on TO1000#1#chr03 ...\n", - "[gaf2aln::Graph position processing] Running on W1701#1#chr03 ...\n" - ] - } - ], - "source": [ - "print(f\"[gaf2aln::Graph position 
processing] Computing nodes positions in each paths...\")\n", - "def get_node_pos(path_name, nodes = nodes, paths = paths, nodes_length = nodes_length):\n", - " print(f\"[gaf2aln::Graph position processing] Running on {path_name} ...\")\n", - " cur_pos = 0\n", - "\n", - " out = {}\n", - " # Iterating over nodes in the path\n", - " for path_node in paths[path_name][\"NODES\"].keys():\n", - " # Instead of checking if the node is one interesting node, we try to add to the nodes dict\n", - " if path_node in aln_nodes :\n", - " out[path_node] = {\n", - " \"START\": cur_pos, # Start position of the node start in the currrent path\n", - " \"END\": cur_pos+nodes_length[path_node], # End position of the node end in the current path\n", - " \"STRAND\": paths[path_name][\"NODES\"][path_node] # Orientation of the node in the current path\n", - " } \n", - "\n", - " cur_pos += nodes_length[path_node]+1\n", - " else :\n", - " cur_pos += nodes_length[path_node]+1\n", - "\n", - " return out\n", - "\n", - "res = {}\n", - "executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads)\n", - "# Adding nodes positions relative to path\n", - "for path_name in paths.keys():\n", - " res[path_name] = executor.submit(get_node_pos, path_name)\n", - "\n", - "executor.shutdown(wait=True)\n", - "\n", - "for path_name, out in res.items():\n", - " results = out.result()\n", - " for path_node, node_pos in results.items():\n", - " nodes[path_node][path_name] = node_pos\n", - "\n", - "del res" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "bed36bd5-30eb-4d02-8b52-1ae5d753f8f8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...\n", - "[gaf2aln::Alignment position processing] Running on ALN_1 ...\n", - "0 77 77 0 + 154 77\n", - "77 82 0 0 + 5 82\n", - "82 83 0 0 + 1 83\n", - "83 138 0 0 + 55 138\n", - "138 139 0 0 + 1 139\n", - "139 
202 0 0 + 63 202\n", - "202 203 0 0 + 1 203\n", - "203 379 0 0 + 176 379\n", - "379 380 0 0 + 1 380\n", - "380 429 0 0 + 49 429\n", - "429 430 0 0 + 1 430\n", - "430 457 0 0 + 27 457\n", - "457 492 0 0 + 35 492\n", - "492 494 0 0 + 2 494\n", - "494 497 0 0 + 3 497\n", - "497 507 0 0 + 10 507\n", - "507 508 0 0 + 1 508\n", - "508 564 0 0 + 56 564\n", - "564 566 0 0 + 2 566\n", - "566 567 0 0 + 1 567\n", - "567 568 0 0 + 1 568\n", - "568 569 0 0 + 1 569\n", - "569 824 0 0 + 255 824\n", - "824 826 0 0 + 2 826\n", - "826 858 0 0 + 32 858\n", - "858 859 0 0 + 1 859\n", - "859 860 0 0 + 1 860\n", - "860 861 0 0 + 1 861\n", - "861 862 0 0 + 1 862\n", - "862 863 0 0 + 1 863\n", - "863 864 0 0 + 1 864\n", - "864 865 0 0 + 1 865\n", - "865 866 0 0 + 1 866\n", - "866 867 0 0 + 1 867\n", - "867 868 0 0 + 1 868\n", - "868 869 0 0 + 1 869\n", - "869 913 0 0 + 44 913\n", - "913 919 0 0 + 6 919\n", - "919 978 0 0 + 59 978\n", - "978 979 0 0 + 1 979\n", - "979 1038 0 0 + 59 1038\n", - "1038 1045 0 0 + 7 1045\n", - "1045 1046 0 0 + 1 1046\n", - "1046 1080 0 0 + 34 1080\n", - "1080 1081 0 0 + 1 1081\n", - "1081 1107 0 0 + 26 1107\n", - "1107 1108 0 0 + 1 1108\n", - "1108 1183 0 0 + 75 1183\n", - "1183 1186 0 0 + 3 1186\n", - "1186 1224 0 0 + 38 1224\n", - "1224 1257 0 0 + 33 1257\n", - "1257 1289 0 0 + 32 1289\n", - "1289 1311 0 0 + 22 1311\n", - "1311 1359 0 0 + 48 1359\n", - "1359 1382 0 0 + 23 1382\n", - "1382 1434 0 0 + 52 1434\n", - "1434 1451 0 0 + 17 1451\n", - "1451 1531 0 0 + 80 1531\n", - "1531 1532 0 0 + 1 1532\n", - "1532 1543 0 0 + 11 1543\n", - "1543 1544 0 0 + 1 1544\n", - "1544 1572 0 0 + 28 1572\n", - "1572 1573 0 0 + 1 1573\n", - "1573 1587 0 0 + 14 1587\n", - "1587 1588 0 0 + 1 1588\n", - "1588 1616 0 0 + 28 1616\n", - "1616 1617 0 0 + 1 1617\n", - "1617 1646 0 0 + 29 1646\n", - "1646 1661 0 0 + 15 1661\n", - "1661 1673 0 0 + 12 1673\n", - "1673 1674 0 0 + 1 1674\n", - "1674 1726 0 0 + 52 1726\n", - "1726 1727 0 0 + 1 1727\n", - "1727 1762 0 0 + 35 1762\n", - "1762 
1763 0 0 + 1 1763\n", - "1763 1764 0 0 + 1 1764\n", - "1764 1765 0 0 + 1 1765\n", - "1765 1766 0 0 + 1 1766\n", - "1766 1767 0 0 + 1 1767\n", - "1767 1824 0 0 + 57 1824\n", - "1824 1825 0 0 + 1 1825\n", - "1825 1975 0 0 + 150 1975\n", - "1975 1976 0 0 + 1 1976\n", - "1976 2015 0 0 + 39 2015\n", - "2015 2016 0 0 + 1 2016\n", - "2016 2047 0 0 + 31 2047\n", - "2047 2055 0 0 + 8 2055\n", - "2055 2056 0 0 + 1 2056\n", - "2056 2120 0 0 + 64 2120\n", - "2120 2121 0 0 + 1 2121\n", - "2121 2157 0 0 + 36 2157\n", - "2157 2158 0 0 + 1 2158\n", - "2158 2170 0 0 + 12 2170\n", - "2170 2171 0 0 + 1 2171\n", - "2171 2205 0 0 + 34 2205\n", - "2205 2206 0 0 + 1 2206\n", - "2206 2344 0 0 + 138 2344\n", - "2344 2345 0 0 + 1 2345\n", - "2345 2364 0 0 + 19 2364\n", - "2364 2383 0 0 + 19 2383\n", - "2383 2408 0 0 + 25 2408\n", - "2408 2409 0 0 + 1 2409\n", - "2409 2441 0 0 + 32 2441\n", - "2441 2442 0 0 + 1 2442\n", - "2442 2580 0 0 + 138 2580\n", - "2580 2581 0 0 + 1 2581\n", - "2581 2582 0 0 + 1 2582\n", - "2582 2583 0 0 + 1 2583\n", - "2583 2584 0 0 + 1 2584\n", - "2584 2764 0 0 + 180 2764\n", - "2764 2765 0 0 + 1 2765\n", - "2765 2797 0 0 + 32 2797\n", - "2797 2798 0 0 + 1 2798\n", - "2798 2878 0 0 + 80 2878\n", - "2878 2879 0 0 + 1 2879\n", - "2879 2951 0 0 + 72 2951\n", - "2951 2952 0 0 + 1 2952\n", - "2952 3002 0 0 + 50 3002\n", - "3002 3077 0 0 + 75 3077\n", - "3077 3078 0 0 + 1 3078\n", - "3078 3093 0 0 + 15 3093\n", - "3093 3094 0 0 + 1 3094\n", - "3094 3097 0 0 + 3 3097\n", - "3097 3140 0 0 + 43 3140\n", - "3140 3210 0 0 + 70 3210\n", - "3210 3211 0 0 + 1 3211\n", - "3211 3229 0 0 + 18 3229\n", - "3229 3230 0 0 + 1 3230\n", - "3230 3276 0 0 + 46 3276\n", - "3276 3277 0 0 + 1 3277\n", - "3277 3315 0 0 + 38 3315\n", - "3315 3316 0 0 + 1 3316\n", - "3316 3322 0 0 + 6 3322\n", - "3322 3323 0 0 + 1 3323\n", - "3323 3348 0 0 + 25 3348\n", - "3348 3349 0 0 + 1 3349\n", - "3349 3350 0 0 + 1 3350\n", - "3350 3351 0 0 + 1 3351\n", - "3351 3352 0 0 + 1 3352\n", - "3352 3353 0 0 + 1 
3353\n", - "3353 3354 0 0 + 1 3354\n", - "3354 3356 0 0 + 2 3356\n", - "3356 3357 0 0 + 1 3357\n", - "3357 3489 0 0 + 132 3489\n", - "3489 3490 0 0 + 1 3490\n", - "3490 3642 0 0 + 152 3642\n", - "3642 3644 0 0 + 2 3644\n", - "3644 3685 0 0 + 41 3685\n", - "3685 3687 0 0 + 2 3687\n", - "3687 3693 0 0 + 6 3693\n", - "3693 3694 0 0 + 1 3694\n", - "3694 3708 0 0 + 14 3708\n", - "3708 3709 0 0 + 1 3709\n", - "3709 3710 0 0 + 1 3710\n", - "3710 3714 0 0 + 4 3714\n", - "3714 3715 0 0 + 1 3715\n", - "3715 3716 0 0 + 1 3716\n", - "3716 3720 0 0 + 4 3720\n", - "3720 3721 0 0 + 1 3721\n", - "3721 3722 0 0 + 1 3722\n", - "3722 3735 0 10 + 23 3735\n", - "[gaf2aln::Alignment position processing] Running on ALN_2 ...\n", - "0 1 0 0 + 1 1\n", - "1 2 0 0 + 1 2\n", - "2 3 0 0 + 1 3\n", - "3 4 0 0 + 1 4\n", - "4 5 0 0 + 1 5\n", - "5 6 0 0 + 1 6\n", - "6 7 0 0 + 1 7\n", - "7 8 0 0 + 1 8\n", - "8 9 0 0 + 1 9\n", - "9 10 0 0 + 1 10\n", - "10 11 0 0 + 1 11\n", - "11 12 0 0 + 1 12\n", - "12 13 0 0 + 1 13\n", - "13 14 0 0 + 1 14\n", - "14 15 0 0 + 1 15\n", - "15 16 0 0 + 1 16\n", - "16 17 0 0 + 1 17\n", - "17 18 0 0 + 1 18\n", - "18 19 0 0 + 1 19\n", - "19 20 0 0 + 1 20\n", - "20 21 0 0 + 1 21\n", - "21 22 0 0 + 1 22\n", - "22 23 0 0 + 1 23\n", - "23 24 0 0 + 1 24\n", - "24 25 0 0 + 1 25\n", - "25 26 0 0 + 1 26\n", - "26 27 0 0 + 1 27\n", - "27 28 0 0 + 1 28\n", - "28 29 0 0 + 1 29\n", - "29 30 0 0 + 1 30\n", - "30 31 0 0 + 1 31\n", - "31 32 0 0 + 1 32\n", - "32 33 0 0 + 1 33\n", - "33 34 0 0 + 1 34\n", - "34 35 0 0 + 1 35\n", - "35 36 0 0 + 1 36\n", - "36 37 0 0 + 1 37\n", - "37 38 0 0 + 1 38\n", - "38 39 0 0 + 1 39\n", - "39 40 0 0 + 1 40\n", - "40 41 0 0 + 1 41\n", - "41 42 0 0 + 1 42\n", - "42 43 0 0 + 1 43\n", - "43 44 0 0 + 1 44\n", - "44 45 0 0 + 1 45\n", - "45 46 0 0 + 1 46\n", - "46 47 0 0 + 1 47\n", - "47 48 0 0 + 1 48\n", - "48 49 0 0 + 1 49\n", - "49 50 0 0 + 1 50\n", - "50 51 0 0 + 1 51\n", - "51 52 0 0 + 1 52\n", - "52 53 0 0 + 1 53\n", - "53 54 0 0 + 1 54\n", - "54 55 0 0 + 
1 55\n", - "55 56 0 0 + 1 56\n", - "56 57 0 0 + 1 57\n", - "57 58 0 0 + 1 58\n", - "58 59 0 0 + 1 59\n", - "59 60 0 0 + 1 60\n", - "60 61 0 0 + 1 61\n", - "61 62 0 0 + 1 62\n", - "62 63 0 0 + 1 63\n", - "63 64 0 0 + 1 64\n", - "64 65 0 0 + 1 65\n", - "65 66 0 0 + 1 66\n", - "66 67 0 0 + 1 67\n", - "67 68 0 0 + 1 68\n", - "68 69 0 0 + 1 69\n", - "69 70 0 0 + 1 70\n", - "70 71 0 0 + 1 71\n", - "71 72 0 0 + 1 72\n", - "72 73 0 0 + 1 73\n", - "73 74 0 0 + 1 74\n", - "74 75 0 0 + 1 75\n", - "75 76 0 0 + 1 76\n", - "76 77 0 0 + 1 77\n", - "77 78 0 0 + 1 78\n", - "78 3735 0 57489 + 61146 3735\n" - ] - } - ], - "source": [ - "print(f\"[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...\")\n", - "# Adding nodes positions relative to path\n", - "\n", - "def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length):\n", - " # Initializing current position in query\n", - " cur_pos = 0\n", - "\n", - " # Getting start and end node ids\n", - " start_end_id = (aln_dict[aln_name][\"PATH.MATCH\"][0][0], aln_dict[aln_name][\"PATH.MATCH\"][-1][0])\n", - "\n", - " # Creating result dictionnary\n", - " res = {}\n", - "\n", - " ## Iterating over node_ids from the given alignment\n", - " for node_id, orient in aln_dict[aln_name][\"PATH.MATCH\"]:\n", - " # Adding entry for current node\n", - " res[node_id] = {aln_name: {}}\n", - "\n", - " # First node\n", - " if node_id == start_end_id[0]:\n", - " start_pos = 0\n", - " s_off = int(aln_dict[aln_name][\"ALN.START\"])\n", - " end_pos = nodes_length[node_id]-s_off\n", - " e_off = 0\n", - " # End node\n", - " elif node_id == start_end_id[1]:\n", - " start_pos = cur_pos\n", - " s_off = 0\n", - " end_pos = int(aln_dict[aln_name][\"QRY.END\"])\n", - " e_off = nodes_length[node_id]-(end_pos-cur_pos)\n", - " # Node in between\n", - " else :\n", - " start_pos = cur_pos\n", - " s_off, e_off = 0, 0\n", - " end_pos = cur_pos+nodes_length[node_id]\n", - "\n", - " res[node_id] = {\n", - " \"START\": 
start_pos, # Start position on the query\n", - " \"END\": end_pos, # End position on the query\n", - " \"S.OFF\": s_off, # Offset between the start of the alignment and the node's start\n", - " \"E.OFF\": e_off, # Offset between the end of the alignment and the node's end \n", - " \"STRAND\": orient # Orientation of the node in the alignment\n", - " }\n", - " \n", - " cur_pos = end_pos\n", - " print(start_pos, end_pos, s_off, e_off, orient, nodes_length[node_id], cur_pos)\n", - "\n", - " return res\n", - "\n", - "# Storing alignement \n", - "res = {}\n", - "executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads)\n", - "for aln_name in aln_dict.keys():\n", - " print(f\"[gaf2aln::Alignment position processing] Running on {aln_name} ...\")\n", - " \n", - " res[aln_name] = executor.submit(get_aln_node_info, aln_name)\n", - " #res[aln_name] = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length)\n", - "\n", - "executor.shutdown(wait=True)\n", - "\n", - "for aln_name, node_info in res.items():\n", - " results = node_info.result()\n", - " for node_id, info in results.items():\n", - " nodes[node_id][aln_name] = info\n", - "\n", - "del res" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "4c30727c-7ffc-4852-ad81-ca2a5a7f9957", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...\n", - "[gaf2aln::CIGAR processing] Running on ALN_1 ...\n", - "[gaf2aln::CIGAR processing] Running on ALN_2 ...\n" - ] - } - ], - "source": [ - "# Calculating CIGAR for each nodes in each aln\n", - "print(f\"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...\")\n", - "# Iterating over alignments\n", - "for aln in aln_dict.keys():\n", - " \n", - " print(f\"[gaf2aln::CIGAR processing] Running on {aln} ...\")\n", - " # Getting the list of base level alignement ([\"=\", \"X\", ...] 
from \"1=1X...\")\n", - " raw_cigar = cigar2basealn(aln_dict[aln][\"RAW.CIGAR\"])\n", - " CIGAR={}\n", - "\n", - " for node_id, orient in aln_dict[aln][\"PATH.MATCH\"]:\n", - "\n", - " _cigar = basealn2cigar(raw_cigar[\n", - " nodes[node_id][aln][\"START\"]:nodes[node_id][aln][\"END\"]\n", - " ])\n", - " nodes[node_id][aln][\"CIGAR\"] = _cigar\n", - " #print(_cigar, nodes[node_id][aln][\"START\"], nodes[node_id][aln][\"END\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e15e4762-cd71-4afe-bc74-ebe44869fee6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ALN_1\n", - "7046526 D134#1#chr03 0 77\n", - "\tIn path\n", - "\t 73306158 73306235\n", - "skipped\n", - "\n", - "7046528 D134#1#chr03 77 82\n", - "\tIn path\n", - "\t 73306238 73306243\n", - "{'Q.START': 0, 'Q.END': 77, 'T.START': 73306158, 'T.END': 73306235, 'CG': '77='}\n", - "7046530 D134#1#chr03 82 83\n", - "\tNot in path\n", - "7046531 D134#1#chr03 83 138\n", - "\tIn path\n", - "\t 73306246 73306301\n", - "{'Q.START': 77, 'Q.END': 82, 'T.START': 73306238, 'T.END': 73306243, 'CG': '5='}\n", - "7046532 D134#1#chr03 138 139\n", - "\tNot in path\n", - "7046533 D134#1#chr03 139 202\n", - "\tIn path\n", - "\t 73306302 73306365\n", - "{'Q.START': 83, 'Q.END': 138, 'T.START': 73306246, 'T.END': 73306301, 'CG': '55='}\n", - "7046534 D134#1#chr03 202 203\n", - "\tIn path\n", - "\t 73306366 73306367\n", - "{'Q.START': 139, 'Q.END': 202, 'T.START': 73306302, 'T.END': 73306365, 'CG': '63='}\n", - "7046536 D134#1#chr03 203 379\n", - "\tIn path\n", - "\t 73306368 73306544\n", - "{'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}\n", - "7046537 D134#1#chr03 379 380\n", - "\tIn path\n", - "\t 73306545 73306546\n", - "{'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}\n", - "7046539 D134#1#chr03 380 429\n", - "\tIn path\n", - "\t 73306547 73306596\n", - "{'Q.START': 379, 'Q.END': 380, 
'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}\n", - "7046541 D134#1#chr03 429 430\n", - "\tIn path\n", - "\t 73306597 73306598\n", - "{'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}\n", - "7046542 D134#1#chr03 430 457\n", - "\tIn path\n", - "\t 73306599 73306626\n", - "{'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}\n", - "7046544 D134#1#chr03 457 492\n", - "\tIn path\n", - "\t 73306641 73306676\n", - "{'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}\n", - "7046546 D134#1#chr03 492 494\n", - "\tNot in path\n", - "7046547 D134#1#chr03 494 497\n", - "\tNot in path\n", - "7046549 D134#1#chr03 497 507\n", - "\tNot in path\n", - "7046551 D134#1#chr03 507 508\n", - "\tNot in path\n", - "7046552 D134#1#chr03 508 564\n", - "\tIn path\n", - "\t 73306694 73306750\n", - "{'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}\n", - "7046554 D134#1#chr03 564 566\n", - "\tNot in path\n", - "7046556 D134#1#chr03 568 569\n", - "\tIn path\n", - "\t 73306753 73306754\n", - "{'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}\n", - "7046556 D134#1#chr03 568 569\n", - "\tIn path\n", - "\t 73306753 73306754\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", - "7046556 D134#1#chr03 568 569\n", - "\tIn path\n", - "\t 73306753 73306754\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", - "7046557 D134#1#chr03 569 824\n", - "\tIn path\n", - "\t 73306755 73307010\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}\n", - "7046558 D134#1#chr03 824 826\n", - "\tNot in path\n", - "7046559 D134#1#chr03 826 858\n", - "\tIn path\n", - "\t 73307011 73307043\n", - "{'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}\n", - "7046560 D134#1#chr03 858 859\n", - "\tIn path\n", - 
"\t 73307044 73307045\n", - "{'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046561 D134#1#chr03 868 869\n", - "\tIn path\n", - "\t 73307046 73307047\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - "7046562 D134#1#chr03 869 913\n", - "\tIn path\n", - "\t 73307048 73307092\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}\n", - 
"7046564 D134#1#chr03 913 919\n", - "\tIn path\n", - "\t 73307093 73307099\n", - "{'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}\n", - "7046565 D134#1#chr03 919 978\n", - "\tIn path\n", - "\t 73307100 73307159\n", - "{'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}\n", - "7046567 D134#1#chr03 978 979\n", - "\tIn path\n", - "\t 73307160 73307161\n", - "{'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}\n", - "7046568 D134#1#chr03 979 1038\n", - "\tIn path\n", - "\t 73307162 73307221\n", - "{'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}\n", - "7046570 D134#1#chr03 1038 1045\n", - "\tIn path\n", - "\t 73307224 73307231\n", - "{'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}\n", - "7046571 D134#1#chr03 1045 1046\n", - "\tIn path\n", - "\t 73307232 73307233\n", - "{'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}\n", - "7046573 D134#1#chr03 1046 1080\n", - "\tIn path\n", - "\t 73307234 73307268\n", - "{'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}\n", - "7046574 D134#1#chr03 1080 1081\n", - "\tIn path\n", - "\t 73307269 73307270\n", - "{'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 'CG': '34='}\n", - "7046576 D134#1#chr03 1081 1107\n", - "\tIn path\n", - "\t 73307271 73307297\n", - "{'Q.START': 1080, 'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}\n", - "7046577 D134#1#chr03 1107 1108\n", - "\tNot in path\n", - "7046579 D134#1#chr03 1108 1183\n", - "\tIn path\n", - "\t 73307300 73307375\n", - "{'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}\n", - "7046581 D134#1#chr03 1183 1186\n", - "\tIn path\n", - "\t 73307376 73307379\n", - "{'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}\n", - "7046583 D134#1#chr03 
1186 1224\n", - "\tNot in path\n", - "7046584 D134#1#chr03 1224 1257\n", - "\tIn path\n", - "\t 73307419 73307452\n", - "{'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}\n", - "7046586 D134#1#chr03 1257 1289\n", - "\tNot in path\n", - "7046587 D134#1#chr03 1289 1311\n", - "\tIn path\n", - "\t 73307475 73307497\n", - "{'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}\n", - "7046589 D134#1#chr03 1311 1359\n", - "\tNot in path\n", - "7046590 D134#1#chr03 1359 1382\n", - "\tIn path\n", - "\t 73307546 73307569\n", - "{'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}\n", - "7046592 D134#1#chr03 1382 1434\n", - "\tNot in path\n", - "7046593 D134#1#chr03 1434 1451\n", - "\tIn path\n", - "\t 73307643 73307660\n", - "{'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}\n", - "7046594 D134#1#chr03 1451 1531\n", - "\tIn path\n", - "\t 73307661 73307741\n", - "{'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}\n", - "7046596 D134#1#chr03 1531 1532\n", - "\tNot in path\n", - "7046597 D134#1#chr03 1532 1543\n", - "\tIn path\n", - "\t 73307744 73307755\n", - "{'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}\n", - "7046599 D134#1#chr03 1543 1544\n", - "\tNot in path\n", - "7046600 D134#1#chr03 1544 1572\n", - "\tIn path\n", - "\t 73307758 73307786\n", - "{'Q.START': 1532, 'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}\n", - "7046601 D134#1#chr03 1572 1573\n", - "\tIn path\n", - "\t 73307787 73307788\n", - "{'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}\n", - "7046603 D134#1#chr03 1573 1587\n", - "\tIn path\n", - "\t 73307789 73307803\n", - "{'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}\n", - "7046604 D134#1#chr03 1587 1588\n", - "\tNot in path\n", - "7046606 D134#1#chr03 1588 
1616\n", - "\tIn path\n", - "\t 73307806 73307834\n", - "{'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}\n", - "7046608 D134#1#chr03 1616 1617\n", - "\tIn path\n", - "\t 73307835 73307836\n", - "{'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}\n", - "7046609 D134#1#chr03 1617 1646\n", - "\tIn path\n", - "\t 73307837 73307866\n", - "{'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 'T.END': 73307836, 'CG': '1='}\n", - "7046621 D134#1#chr03 1646 1661\n", - "\tIn path\n", - "\t 73307867 73307882\n", - "{'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}\n", - "7046622 D134#1#chr03 1661 1673\n", - "\tIn path\n", - "\t 73307883 73307895\n", - "{'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}\n", - "7046624 D134#1#chr03 1673 1674\n", - "\tIn path\n", - "\t 73307896 73307897\n", - "{'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}\n", - "7046625 D134#1#chr03 1674 1726\n", - "\tIn path\n", - "\t 73307898 73307950\n", - "{'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}\n", - "7046626 D134#1#chr03 1726 1727\n", - "\tNot in path\n", - "7046628 D134#1#chr03 1727 1762\n", - "\tIn path\n", - "\t 73307953 73307988\n", - "{'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}\n", - "7046631 D134#1#chr03 1766 1767\n", - "\tIn path\n", - "\t 73307991 73307992\n", - "{'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}\n", - "7046673 D134#1#chr03 1765 1766\n", - "\tIn path\n", - "\t 73307993 73307994\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", - "7046631 D134#1#chr03 1766 1767\n", - "\tIn path\n", - "\t 73307991 73307992\n", - "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", - "7046673 D134#1#chr03 1765 1766\n", 
- "\tIn path\n", - "\t 73307993 73307994\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", - "7046631 D134#1#chr03 1766 1767\n", - "\tIn path\n", - "\t 73307991 73307992\n", - "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}\n", - "7046632 D134#1#chr03 1767 1824\n", - "\tIn path\n", - "\t 73307995 73308052\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}\n", - "7046634 D134#1#chr03 1824 1825\n", - "\tIn path\n", - "\t 73308053 73308054\n", - "{'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}\n", - "7046635 D134#1#chr03 1825 1975\n", - "\tIn path\n", - "\t 73308055 73308205\n", - "{'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}\n", - "7046637 D134#1#chr03 1975 1976\n", - "\tNot in path\n", - "7046638 D134#1#chr03 1976 2015\n", - "\tIn path\n", - "\t 73308208 73308247\n", - "{'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}\n", - "7046639 D134#1#chr03 2015 2016\n", - "\tNot in path\n", - "7046641 D134#1#chr03 2016 2047\n", - "\tIn path\n", - "\t 73308250 73308281\n", - "{'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}\n", - "7046644 D134#1#chr03 2047 2055\n", - "\tIn path\n", - "\t 73308286 73308294\n", - "{'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': '31='}\n", - "7046646 D134#1#chr03 2055 2056\n", - "\tNot in path\n", - "7046647 D134#1#chr03 2056 2120\n", - "\tIn path\n", - "\t 73308297 73308361\n", - "{'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}\n", - "7046649 D134#1#chr03 2120 2121\n", - "\tIn path\n", - "\t 73308362 73308363\n", - "{'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}\n", - "7046650 D134#1#chr03 2121 2157\n", - "\tIn path\n", - "\t 73308364 73308400\n", - "{'Q.START': 2120, 
'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}\n", - "7046652 D134#1#chr03 2157 2158\n", - "\tNot in path\n", - "7046653 D134#1#chr03 2158 2170\n", - "\tIn path\n", - "\t 73308403 73308415\n", - "{'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}\n", - "7046654 D134#1#chr03 2170 2171\n", - "\tIn path\n", - "\t 73308416 73308417\n", - "{'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}\n", - "7046656 D134#1#chr03 2171 2205\n", - "\tIn path\n", - "\t 73308418 73308452\n", - "{'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}\n", - "7046657 D134#1#chr03 2205 2206\n", - "\tNot in path\n", - "7046659 D134#1#chr03 2206 2344\n", - "\tIn path\n", - "\t 73308455 73308593\n", - "{'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 73308452, 'CG': '34='}\n", - "7046660 D134#1#chr03 2344 2345\n", - "\tNot in path\n", - "7046662 D134#1#chr03 2345 2364\n", - "\tIn path\n", - "\t 73308596 73308615\n", - "{'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}\n", - "7046663 D134#1#chr03 2364 2383\n", - "\tIn path\n", - "\t 73308616 73308635\n", - "{'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}\n", - "7046665 D134#1#chr03 2383 2408\n", - "\tIn path\n", - "\t 73308636 73308661\n", - "{'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': '19='}\n", - "7046667 D134#1#chr03 2408 2409\n", - "\tIn path\n", - "\t 73308662 73308663\n", - "{'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}\n", - "7046668 D134#1#chr03 2409 2441\n", - "\tIn path\n", - "\t 73308664 73308696\n", - "{'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}\n", - "7046670 D134#1#chr03 2441 2442\n", - "\tIn path\n", - "\t 73308697 73308698\n", - "{'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': 
'32='}\n", - "7046671 D134#1#chr03 2442 2580\n", - "\tIn path\n", - "\t 73308699 73308837\n", - "{'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}\n", - "7046674 D134#1#chr03 2582 2583\n", - "\tIn path\n", - "\t 73308838 73308839\n", - "{'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': '138='}\n", - "7046675 D134#1#chr03 2583 2584\n", - "\tIn path\n", - "\t 73308840 73308841\n", - "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", - "7046674 D134#1#chr03 2582 2583\n", - "\tIn path\n", - "\t 73308838 73308839\n", - "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", - "7046675 D134#1#chr03 2583 2584\n", - "\tIn path\n", - "\t 73308840 73308841\n", - "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}\n", - "7046676 D134#1#chr03 2584 2764\n", - "\tIn path\n", - "\t 73308842 73309022\n", - "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}\n", - "7046678 D134#1#chr03 2764 2765\n", - "\tNot in path\n", - "7046679 D134#1#chr03 2765 2797\n", - "\tIn path\n", - "\t 73309025 73309057\n", - "{'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}\n", - "7046680 D134#1#chr03 2797 2798\n", - "\tNot in path\n", - "7046682 D134#1#chr03 2798 2878\n", - "\tIn path\n", - "\t 73309060 73309140\n", - "{'Q.START': 2765, 'Q.END': 2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}\n", - "7046684 D134#1#chr03 2878 2879\n", - "\tIn path\n", - "\t 73309141 73309142\n", - "{'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}\n", - "7046685 D134#1#chr03 2879 2951\n", - "\tIn path\n", - "\t 73309143 73309215\n", - "{'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}\n", - "7046686 D134#1#chr03 2951 2952\n", - "\tIn path\n", - "\t 73309216 73309217\n", - "{'Q.START': 2879, 'Q.END': 
2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}\n", - "7046688 D134#1#chr03 2952 3002\n", - "\tIn path\n", - "\t 73309218 73309268\n", - "{'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}\n", - "7046690 D134#1#chr03 3002 3077\n", - "\tIn path\n", - "\t 73309271 73309346\n", - "{'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}\n", - "7046692 D134#1#chr03 3077 3078\n", - "\tIn path\n", - "\t 73309347 73309348\n", - "{'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}\n", - "7046693 D134#1#chr03 3078 3093\n", - "\tIn path\n", - "\t 73309349 73309364\n", - "{'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}\n", - "7046695 D134#1#chr03 3093 3094\n", - "\tNot in path\n", - "7046696 D134#1#chr03 3094 3097\n", - "\tIn path\n", - "\t 73309367 73309370\n", - "{'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}\n", - "7046698 D134#1#chr03 3097 3140\n", - "\tIn path\n", - "\t 73309371 73309414\n", - "{'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 73309370, 'CG': '3='}\n", - "7046700 D134#1#chr03 3140 3210\n", - "\tIn path\n", - "\t 73309415 73309485\n", - "{'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}\n", - "7046702 D134#1#chr03 3210 3211\n", - "\tIn path\n", - "\t 73309486 73309487\n", - "{'Q.START': 3140, 'Q.END': 3210, 'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}\n", - "7046703 D134#1#chr03 3211 3229\n", - "\tIn path\n", - "\t 73309488 73309506\n", - "{'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}\n", - "7046704 D134#1#chr03 3229 3230\n", - "\tIn path\n", - "\t 73309507 73309508\n", - "{'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}\n", - "7046706 D134#1#chr03 3230 3276\n", - "\tIn path\n", - "\t 73309509 73309555\n", - "{'Q.START': 3229, 'Q.END': 3230, 
'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}\n", - "7046707 D134#1#chr03 3276 3277\n", - "\tNot in path\n", - "7046709 D134#1#chr03 3277 3315\n", - "\tIn path\n", - "\t 73309558 73309596\n", - "{'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}\n", - "7046710 D134#1#chr03 3315 3316\n", - "\tNot in path\n", - "7046712 D134#1#chr03 3316 3322\n", - "\tIn path\n", - "\t 73309599 73309605\n", - "{'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}\n", - "7046713 D134#1#chr03 3322 3323\n", - "\tNot in path\n", - "7046715 D134#1#chr03 3323 3348\n", - "\tIn path\n", - "\t 73309608 73309633\n", - "{'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}\n", - "7046718 D134#1#chr03 3352 3353\n", - "\tIn path\n", - "\t 73309634 73309635\n", - "{'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}\n", - "7046717 D134#1#chr03 3351 3352\n", - "\tIn path\n", - "\t 73309636 73309637\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", - "7046718 D134#1#chr03 3352 3353\n", - "\tIn path\n", - "\t 73309634 73309635\n", - "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", - "7046717 D134#1#chr03 3351 3352\n", - "\tIn path\n", - "\t 73309636 73309637\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", - "7046718 D134#1#chr03 3352 3353\n", - "\tIn path\n", - "\t 73309634 73309635\n", - "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}\n", - "7046720 D134#1#chr03 3353 3354\n", - "\tIn path\n", - "\t 73309638 73309639\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}\n", - "7046722 D134#1#chr03 3354 3356\n", - "\tIn path\n", - "\t 73309640 73309642\n", - "{'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}\n", - "7046724 
D134#1#chr03 3356 3357\n", - "\tNot in path\n", - "7046725 D134#1#chr03 3357 3489\n", - "\tIn path\n", - "\t 73309645 73309777\n", - "{'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}\n", - "7046727 D134#1#chr03 3489 3490\n", - "\tNot in path\n", - "7046728 D134#1#chr03 3490 3642\n", - "\tIn path\n", - "\t 73309780 73309932\n", - "{'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}\n", - "7046729 D134#1#chr03 3642 3644\n", - "\tNot in path\n", - "7046730 D134#1#chr03 3644 3685\n", - "\tIn path\n", - "\t 73309933 73309974\n", - "{'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}\n", - "7046731 D134#1#chr03 3685 3687\n", - "\tNot in path\n", - "7046733 D134#1#chr03 3687 3693\n", - "\tIn path\n", - "\t 73309977 73309983\n", - "{'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}\n", - "7046735 D134#1#chr03 3693 3694\n", - "\tNot in path\n", - "7046736 D134#1#chr03 3694 3708\n", - "\tIn path\n", - "\t 73309986 73310000\n", - "{'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}\n", - "7046738 D134#1#chr03 3720 3721\n", - "\tIn path\n", - "\t 73310010 73310011\n", - "{'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}\n", - "7046739 D134#1#chr03 3721 3722\n", - "\tIn path\n", - "\t 73310003 73310004\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", - "7046740 D134#1#chr03 3716 3720\n", - "\tIn path\n", - "\t 73310005 73310009\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", - "7046738 D134#1#chr03 3720 3721\n", - "\tIn path\n", - "\t 73310010 73310011\n", - "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", - "7046739 D134#1#chr03 3721 3722\n", - "\tIn path\n", - "\t 73310003 73310004\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 
73310010, 'T.END': 73310011, 'CG': '1='}\n", - "7046740 D134#1#chr03 3716 3720\n", - "\tIn path\n", - "\t 73310005 73310009\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", - "7046738 D134#1#chr03 3720 3721\n", - "\tIn path\n", - "\t 73310010 73310011\n", - "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}\n", - "7046739 D134#1#chr03 3721 3722\n", - "\tIn path\n", - "\t 73310003 73310004\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}\n", - "7046741 D134#1#chr03 3722 3735\n", - "\tIn path\n", - "\t 73310012 73310045\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}\n", - "ALN_1\n", - "7046526 TO1000#1#chr03 0 77\n", - "\t 64684013 64684090\n", - "skipped\n", - "\n", - "7046528 TO1000#1#chr03 77 82\n", - "\t 64684091 64684096\n", - "{'Q.START': 0, 'Q.END': 77, 'T.START': 64684013, 'T.END': 64684090, 'CG': '77='}\n", - "7046530 TO1000#1#chr03 82 83\n", - "\t 64684097 64684098\n", - "{'Q.START': 77, 'Q.END': 82, 'T.START': 64684091, 'T.END': 64684096, 'CG': '5='}\n", - "7046531 TO1000#1#chr03 83 138\n", - "\t 64684099 64684154\n", - "{'Q.START': 82, 'Q.END': 83, 'T.START': 64684097, 'T.END': 64684098, 'CG': '1='}\n", - "7046532 TO1000#1#chr03 138 139\n", - "\t 64684155 64684156\n", - "{'Q.START': 83, 'Q.END': 138, 'T.START': 64684099, 'T.END': 64684154, 'CG': '55='}\n", - "7046533 TO1000#1#chr03 139 202\n", - "\t 64684157 64684220\n", - "{'Q.START': 138, 'Q.END': 139, 'T.START': 64684155, 'T.END': 64684156, 'CG': '1='}\n", - "7046534 TO1000#1#chr03 202 203\n", - "\t 64684221 64684222\n", - "{'Q.START': 139, 'Q.END': 202, 'T.START': 64684157, 'T.END': 64684220, 'CG': '63='}\n", - "7046536 TO1000#1#chr03 203 379\n", - "\t 64684223 64684399\n", - "{'Q.START': 202, 'Q.END': 203, 'T.START': 64684221, 'T.END': 64684222, 'CG': '1='}\n", - "7046537 TO1000#1#chr03 379 380\n", - "\t 64684400 64684401\n", - 
"{'Q.START': 203, 'Q.END': 379, 'T.START': 64684223, 'T.END': 64684399, 'CG': '176='}\n", - "7046539 TO1000#1#chr03 380 429\n", - "\t 64684402 64684451\n", - "{'Q.START': 379, 'Q.END': 380, 'T.START': 64684400, 'T.END': 64684401, 'CG': '1='}\n", - "7046541 TO1000#1#chr03 429 430\n", - "\t 64684452 64684453\n", - "{'Q.START': 380, 'Q.END': 429, 'T.START': 64684402, 'T.END': 64684451, 'CG': '49='}\n", - "7046542 TO1000#1#chr03 430 457\n", - "\t 64684454 64684481\n", - "{'Q.START': 429, 'Q.END': 430, 'T.START': 64684452, 'T.END': 64684453, 'CG': '1='}\n", - "7046544 TO1000#1#chr03 457 492\n", - "\t 64684482 64684517\n", - "{'Q.START': 430, 'Q.END': 457, 'T.START': 64684454, 'T.END': 64684481, 'CG': '27='}\n", - "7046546 TO1000#1#chr03 492 494\n", - "\t 64684518 64684520\n", - "{'Q.START': 457, 'Q.END': 492, 'T.START': 64684482, 'T.END': 64684517, 'CG': '35='}\n", - "7046547 TO1000#1#chr03 494 497\n", - "\t 64684521 64684524\n", - "{'Q.START': 492, 'Q.END': 494, 'T.START': 64684518, 'T.END': 64684520, 'CG': '2='}\n", - "7046549 TO1000#1#chr03 497 507\n", - "\t 64684525 64684535\n", - "{'Q.START': 494, 'Q.END': 497, 'T.START': 64684521, 'T.END': 64684524, 'CG': '3='}\n", - "7046551 TO1000#1#chr03 507 508\n", - "\t 64684536 64684537\n", - "{'Q.START': 497, 'Q.END': 507, 'T.START': 64684525, 'T.END': 64684535, 'CG': '10='}\n", - "7046552 TO1000#1#chr03 508 564\n", - "\t 64684538 64684594\n", - "{'Q.START': 507, 'Q.END': 508, 'T.START': 64684536, 'T.END': 64684537, 'CG': '1='}\n", - "7046554 TO1000#1#chr03 564 566\n", - "\t 64684595 64684597\n", - "{'Q.START': 508, 'Q.END': 564, 'T.START': 64684538, 'T.END': 64684594, 'CG': '56='}\n", - "7046556 TO1000#1#chr03 568 569\n", - "\t 64684598 64684599\n", - "{'Q.START': 564, 'Q.END': 566, 'T.START': 64684595, 'T.END': 64684597, 'CG': '2='}\n", - "7046556 TO1000#1#chr03 568 569\n", - "\t 64684598 64684599\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", - "7046556 TO1000#1#chr03 568 
569\n", - "\t 64684598 64684599\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", - "7046557 TO1000#1#chr03 569 824\n", - "\t 64684600 64684855\n", - "{'Q.START': 568, 'Q.END': 569, 'T.START': 64684598, 'T.END': 64684599, 'CG': '1='}\n", - "7046558 TO1000#1#chr03 824 826\n", - "\t 64684856 64684858\n", - "{'Q.START': 569, 'Q.END': 824, 'T.START': 64684600, 'T.END': 64684855, 'CG': '255='}\n", - "7046559 TO1000#1#chr03 826 858\n", - "\t 64684859 64684891\n", - "{'Q.START': 824, 'Q.END': 826, 'T.START': 64684856, 'T.END': 64684858, 'CG': '2='}\n", - "7046560 TO1000#1#chr03 858 859\n", - "\t 64684892 64684893\n", - "{'Q.START': 826, 'Q.END': 858, 'T.START': 64684859, 'T.END': 64684891, 'CG': '32='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 858, 'Q.END': 859, 'T.START': 64684892, 'T.END': 64684893, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': 
'1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046561 TO1000#1#chr03 868 869\n", - "\t 64684894 64684895\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046562 TO1000#1#chr03 869 913\n", - "\t 64684896 64684940\n", - "{'Q.START': 868, 'Q.END': 869, 'T.START': 64684894, 'T.END': 64684895, 'CG': '1='}\n", - "7046564 TO1000#1#chr03 913 919\n", - "\t 64684941 64684947\n", - "{'Q.START': 869, 'Q.END': 913, 'T.START': 64684896, 'T.END': 64684940, 'CG': '44='}\n", - "7046565 TO1000#1#chr03 919 978\n", - "\t 64684948 64685007\n", - "{'Q.START': 913, 'Q.END': 919, 'T.START': 64684941, 'T.END': 64684947, 'CG': '6='}\n", - "7046567 TO1000#1#chr03 978 979\n", - "\t 64685008 64685009\n", - "{'Q.START': 919, 'Q.END': 978, 'T.START': 64684948, 'T.END': 64685007, 'CG': '59='}\n", - "7046568 TO1000#1#chr03 979 1038\n", - "\t 64685010 64685069\n", - "{'Q.START': 978, 'Q.END': 979, 'T.START': 64685008, 'T.END': 64685009, 'CG': '1='}\n", - "7046570 TO1000#1#chr03 1038 1045\n", - "\t 64685070 64685077\n", - "{'Q.START': 979, 'Q.END': 1038, 'T.START': 64685010, 'T.END': 64685069, 'CG': '59='}\n", - "7046571 TO1000#1#chr03 1045 1046\n", - "\t 64685078 64685079\n", - "{'Q.START': 1038, 'Q.END': 1045, 'T.START': 64685070, 'T.END': 64685077, 'CG': '7='}\n", - "7046573 TO1000#1#chr03 1046 1080\n", - "\t 64685080 64685114\n", - "{'Q.START': 1045, 'Q.END': 1046, 'T.START': 64685078, 'T.END': 64685079, 'CG': '1='}\n", - "7046574 TO1000#1#chr03 1080 1081\n", - "\t 64685115 64685116\n", - "{'Q.START': 1046, 'Q.END': 1080, 'T.START': 64685080, 'T.END': 64685114, 'CG': '34='}\n", - "7046576 TO1000#1#chr03 1081 1107\n", - "\t 64685117 64685143\n", - "{'Q.START': 1080, 'Q.END': 1081, 'T.START': 64685115, 'T.END': 64685116, 'CG': '1='}\n", - "7046577 TO1000#1#chr03 1107 1108\n", - "\t 64685144 64685145\n", - "{'Q.START': 1081, 
'Q.END': 1107, 'T.START': 64685117, 'T.END': 64685143, 'CG': '26='}\n", - "7046579 TO1000#1#chr03 1108 1183\n", - "\t 64685146 64685221\n", - "{'Q.START': 1107, 'Q.END': 1108, 'T.START': 64685144, 'T.END': 64685145, 'CG': '1='}\n", - "7046581 TO1000#1#chr03 1183 1186\n", - "\t 64685222 64685225\n", - "{'Q.START': 1108, 'Q.END': 1183, 'T.START': 64685146, 'T.END': 64685221, 'CG': '75='}\n", - "7046583 TO1000#1#chr03 1186 1224\n", - "\t 64685226 64685264\n", - "{'Q.START': 1183, 'Q.END': 1186, 'T.START': 64685222, 'T.END': 64685225, 'CG': '3='}\n", - "7046584 TO1000#1#chr03 1224 1257\n", - "\t 64685265 64685298\n", - "{'Q.START': 1186, 'Q.END': 1224, 'T.START': 64685226, 'T.END': 64685264, 'CG': '38='}\n", - "7046586 TO1000#1#chr03 1257 1289\n", - "\t 64685299 64685331\n", - "{'Q.START': 1224, 'Q.END': 1257, 'T.START': 64685265, 'T.END': 64685298, 'CG': '33='}\n", - "7046587 TO1000#1#chr03 1289 1311\n", - "\t 64685332 64685354\n", - "{'Q.START': 1257, 'Q.END': 1289, 'T.START': 64685299, 'T.END': 64685331, 'CG': '32='}\n", - "7046589 TO1000#1#chr03 1311 1359\n", - "\t 64685355 64685403\n", - "{'Q.START': 1289, 'Q.END': 1311, 'T.START': 64685332, 'T.END': 64685354, 'CG': '22='}\n", - "7046590 TO1000#1#chr03 1359 1382\n", - "\t 64685404 64685427\n", - "{'Q.START': 1311, 'Q.END': 1359, 'T.START': 64685355, 'T.END': 64685403, 'CG': '48='}\n", - "7046592 TO1000#1#chr03 1382 1434\n", - "\t 64685428 64685480\n", - "{'Q.START': 1359, 'Q.END': 1382, 'T.START': 64685404, 'T.END': 64685427, 'CG': '23='}\n", - "7046593 TO1000#1#chr03 1434 1451\n", - "\t 64685481 64685498\n", - "{'Q.START': 1382, 'Q.END': 1434, 'T.START': 64685428, 'T.END': 64685480, 'CG': '52='}\n", - "7046594 TO1000#1#chr03 1451 1531\n", - "\t 64685499 64685579\n", - "{'Q.START': 1434, 'Q.END': 1451, 'T.START': 64685481, 'T.END': 64685498, 'CG': '17='}\n", - "7046596 TO1000#1#chr03 1531 1532\n", - "\t 64685580 64685581\n", - "{'Q.START': 1451, 'Q.END': 1531, 'T.START': 64685499, 'T.END': 64685579, 'CG': 
'80='}\n", - "7046597 TO1000#1#chr03 1532 1543\n", - "\t 64685582 64685593\n", - "{'Q.START': 1531, 'Q.END': 1532, 'T.START': 64685580, 'T.END': 64685581, 'CG': '1='}\n", - "7046599 TO1000#1#chr03 1543 1544\n", - "\t 64685594 64685595\n", - "{'Q.START': 1532, 'Q.END': 1543, 'T.START': 64685582, 'T.END': 64685593, 'CG': '11='}\n", - "7046600 TO1000#1#chr03 1544 1572\n", - "\t 64685596 64685624\n", - "{'Q.START': 1543, 'Q.END': 1544, 'T.START': 64685594, 'T.END': 64685595, 'CG': '1='}\n", - "7046601 TO1000#1#chr03 1572 1573\n", - "\t 64685625 64685626\n", - "{'Q.START': 1544, 'Q.END': 1572, 'T.START': 64685596, 'T.END': 64685624, 'CG': '28='}\n", - "7046603 TO1000#1#chr03 1573 1587\n", - "\t 64685627 64685641\n", - "{'Q.START': 1572, 'Q.END': 1573, 'T.START': 64685625, 'T.END': 64685626, 'CG': '1='}\n", - "7046604 TO1000#1#chr03 1587 1588\n", - "\t 64685642 64685643\n", - "{'Q.START': 1573, 'Q.END': 1587, 'T.START': 64685627, 'T.END': 64685641, 'CG': '14='}\n", - "7046606 TO1000#1#chr03 1588 1616\n", - "\t 64685644 64685672\n", - "{'Q.START': 1587, 'Q.END': 1588, 'T.START': 64685642, 'T.END': 64685643, 'CG': '1='}\n", - "7046608 TO1000#1#chr03 1616 1617\n", - "\t 64685673 64685674\n", - "{'Q.START': 1588, 'Q.END': 1616, 'T.START': 64685644, 'T.END': 64685672, 'CG': '28='}\n", - "7046609 TO1000#1#chr03 1617 1646\n", - "\t 64685675 64685704\n", - "{'Q.START': 1616, 'Q.END': 1617, 'T.START': 64685673, 'T.END': 64685674, 'CG': '1='}\n", - "7046621 TO1000#1#chr03 1646 1661\n", - "\t 64685705 64685720\n", - "{'Q.START': 1617, 'Q.END': 1646, 'T.START': 64685675, 'T.END': 64685704, 'CG': '29='}\n", - "7046622 TO1000#1#chr03 1661 1673\n", - "\t 64685721 64685733\n", - "{'Q.START': 1646, 'Q.END': 1661, 'T.START': 64685705, 'T.END': 64685720, 'CG': '15='}\n", - "7046624 TO1000#1#chr03 1673 1674\n", - "\t 64685734 64685735\n", - "{'Q.START': 1661, 'Q.END': 1673, 'T.START': 64685721, 'T.END': 64685733, 'CG': '12='}\n", - "7046625 TO1000#1#chr03 1674 1726\n", - "\t 64685736 
64685788\n", - "{'Q.START': 1673, 'Q.END': 1674, 'T.START': 64685734, 'T.END': 64685735, 'CG': '1='}\n", - "7046626 TO1000#1#chr03 1726 1727\n", - "\t 64685789 64685790\n", - "{'Q.START': 1674, 'Q.END': 1726, 'T.START': 64685736, 'T.END': 64685788, 'CG': '52='}\n", - "7046628 TO1000#1#chr03 1727 1762\n", - "\t 64685791 64685826\n", - "{'Q.START': 1726, 'Q.END': 1727, 'T.START': 64685789, 'T.END': 64685790, 'CG': '1='}\n", - "7046631 TO1000#1#chr03 1766 1767\n", - "\t 64685827 64685828\n", - "{'Q.START': 1727, 'Q.END': 1762, 'T.START': 64685791, 'T.END': 64685826, 'CG': '35='}\n", - "7046673 TO1000#1#chr03 1765 1766\n", - "\t 64685829 64685830\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", - "7046631 TO1000#1#chr03 1766 1767\n", - "\t 64685827 64685828\n", - "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 64685829, 'T.END': 64685830, 'CG': '1='}\n", - "7046673 TO1000#1#chr03 1765 1766\n", - "\t 64685829 64685830\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", - "7046631 TO1000#1#chr03 1766 1767\n", - "\t 64685827 64685828\n", - "{'Q.START': 1765, 'Q.END': 1766, 'T.START': 64685829, 'T.END': 64685830, 'CG': '1='}\n", - "7046632 TO1000#1#chr03 1767 1824\n", - "\t 64685831 64685888\n", - "{'Q.START': 1766, 'Q.END': 1767, 'T.START': 64685827, 'T.END': 64685828, 'CG': '1='}\n", - "7046634 TO1000#1#chr03 1824 1825\n", - "\t 64685889 64685890\n", - "{'Q.START': 1767, 'Q.END': 1824, 'T.START': 64685831, 'T.END': 64685888, 'CG': '57='}\n", - "7046635 TO1000#1#chr03 1825 1975\n", - "\t 64685891 64686041\n", - "{'Q.START': 1824, 'Q.END': 1825, 'T.START': 64685889, 'T.END': 64685890, 'CG': '1='}\n", - "7046637 TO1000#1#chr03 1975 1976\n", - "\t 64686042 64686043\n", - "{'Q.START': 1825, 'Q.END': 1975, 'T.START': 64685891, 'T.END': 64686041, 'CG': '150='}\n", - "7046638 TO1000#1#chr03 1976 2015\n", - "\t 64686044 64686083\n", - "{'Q.START': 1975, 'Q.END': 1976, 'T.START': 64686042, 
'T.END': 64686043, 'CG': '1='}\n", - "7046639 TO1000#1#chr03 2015 2016\n", - "\t 64686084 64686085\n", - "{'Q.START': 1976, 'Q.END': 2015, 'T.START': 64686044, 'T.END': 64686083, 'CG': '39='}\n", - "7046641 TO1000#1#chr03 2016 2047\n", - "\t 64686086 64686117\n", - "{'Q.START': 2015, 'Q.END': 2016, 'T.START': 64686084, 'T.END': 64686085, 'CG': '1='}\n", - "7046644 TO1000#1#chr03 2047 2055\n", - "\t 64686118 64686126\n", - "{'Q.START': 2016, 'Q.END': 2047, 'T.START': 64686086, 'T.END': 64686117, 'CG': '31='}\n", - "7046646 TO1000#1#chr03 2055 2056\n", - "\t 64686127 64686128\n", - "{'Q.START': 2047, 'Q.END': 2055, 'T.START': 64686118, 'T.END': 64686126, 'CG': '8='}\n", - "7046647 TO1000#1#chr03 2056 2120\n", - "\t 64686129 64686193\n", - "{'Q.START': 2055, 'Q.END': 2056, 'T.START': 64686127, 'T.END': 64686128, 'CG': '1='}\n", - "7046649 TO1000#1#chr03 2120 2121\n", - "\t 64686194 64686195\n", - "{'Q.START': 2056, 'Q.END': 2120, 'T.START': 64686129, 'T.END': 64686193, 'CG': '64='}\n", - "7046650 TO1000#1#chr03 2121 2157\n", - "\t 64686196 64686232\n", - "{'Q.START': 2120, 'Q.END': 2121, 'T.START': 64686194, 'T.END': 64686195, 'CG': '1='}\n", - "7046652 TO1000#1#chr03 2157 2158\n", - "\t 64686233 64686234\n", - "{'Q.START': 2121, 'Q.END': 2157, 'T.START': 64686196, 'T.END': 64686232, 'CG': '36='}\n", - "7046653 TO1000#1#chr03 2158 2170\n", - "\t 64686235 64686247\n", - "{'Q.START': 2157, 'Q.END': 2158, 'T.START': 64686233, 'T.END': 64686234, 'CG': '1='}\n", - "7046654 TO1000#1#chr03 2170 2171\n", - "\t 64686248 64686249\n", - "{'Q.START': 2158, 'Q.END': 2170, 'T.START': 64686235, 'T.END': 64686247, 'CG': '12='}\n", - "7046656 TO1000#1#chr03 2171 2205\n", - "\t 64686250 64686284\n", - "{'Q.START': 2170, 'Q.END': 2171, 'T.START': 64686248, 'T.END': 64686249, 'CG': '1='}\n", - "7046657 TO1000#1#chr03 2205 2206\n", - "\t 64686285 64686286\n", - "{'Q.START': 2171, 'Q.END': 2205, 'T.START': 64686250, 'T.END': 64686284, 'CG': '34='}\n", - "7046659 TO1000#1#chr03 2206 
2344\n", - "\t 64686287 64686425\n", - "{'Q.START': 2205, 'Q.END': 2206, 'T.START': 64686285, 'T.END': 64686286, 'CG': '1='}\n", - "7046660 TO1000#1#chr03 2344 2345\n", - "\t 64686426 64686427\n", - "{'Q.START': 2206, 'Q.END': 2344, 'T.START': 64686287, 'T.END': 64686425, 'CG': '138='}\n", - "7046662 TO1000#1#chr03 2345 2364\n", - "\t 64686428 64686447\n", - "{'Q.START': 2344, 'Q.END': 2345, 'T.START': 64686426, 'T.END': 64686427, 'CG': '1='}\n", - "7046663 TO1000#1#chr03 2364 2383\n", - "\t 64686448 64686467\n", - "{'Q.START': 2345, 'Q.END': 2364, 'T.START': 64686428, 'T.END': 64686447, 'CG': '19='}\n", - "7046665 TO1000#1#chr03 2383 2408\n", - "\t 64686468 64686493\n", - "{'Q.START': 2364, 'Q.END': 2383, 'T.START': 64686448, 'T.END': 64686467, 'CG': '19='}\n", - "7046667 TO1000#1#chr03 2408 2409\n", - "\t 64686494 64686495\n", - "{'Q.START': 2383, 'Q.END': 2408, 'T.START': 64686468, 'T.END': 64686493, 'CG': '25='}\n", - "7046668 TO1000#1#chr03 2409 2441\n", - "\t 64686496 64686528\n", - "{'Q.START': 2408, 'Q.END': 2409, 'T.START': 64686494, 'T.END': 64686495, 'CG': '1='}\n", - "7046670 TO1000#1#chr03 2441 2442\n", - "\t 64686529 64686530\n", - "{'Q.START': 2409, 'Q.END': 2441, 'T.START': 64686496, 'T.END': 64686528, 'CG': '32='}\n", - "7046671 TO1000#1#chr03 2442 2580\n", - "\t 64686531 64686669\n", - "{'Q.START': 2441, 'Q.END': 2442, 'T.START': 64686529, 'T.END': 64686530, 'CG': '1='}\n", - "7046674 TO1000#1#chr03 2582 2583\n", - "\t 64686670 64686671\n", - "{'Q.START': 2442, 'Q.END': 2580, 'T.START': 64686531, 'T.END': 64686669, 'CG': '138='}\n", - "7046675 TO1000#1#chr03 2583 2584\n", - "\t 64686672 64686673\n", - "{'Q.START': 2582, 'Q.END': 2583, 'T.START': 64686670, 'T.END': 64686671, 'CG': '1='}\n", - "7046674 TO1000#1#chr03 2582 2583\n", - "\t 64686670 64686671\n", - "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 64686672, 'T.END': 64686673, 'CG': '1='}\n", - "7046675 TO1000#1#chr03 2583 2584\n", - "\t 64686672 64686673\n", - "{'Q.START': 2582, 'Q.END': 
2583, 'T.START': 64686670, 'T.END': 64686671, 'CG': '1='}\n", - "7046676 TO1000#1#chr03 2584 2764\n", - "\t 64686674 64686854\n", - "{'Q.START': 2583, 'Q.END': 2584, 'T.START': 64686672, 'T.END': 64686673, 'CG': '1='}\n", - "7046678 TO1000#1#chr03 2764 2765\n", - "\t 64686855 64686856\n", - "{'Q.START': 2584, 'Q.END': 2764, 'T.START': 64686674, 'T.END': 64686854, 'CG': '180='}\n", - "7046679 TO1000#1#chr03 2765 2797\n", - "\t 64686857 64686889\n", - "{'Q.START': 2764, 'Q.END': 2765, 'T.START': 64686855, 'T.END': 64686856, 'CG': '1='}\n", - "7046680 TO1000#1#chr03 2797 2798\n", - "\t 64686890 64686891\n", - "{'Q.START': 2765, 'Q.END': 2797, 'T.START': 64686857, 'T.END': 64686889, 'CG': '32='}\n", - "7046682 TO1000#1#chr03 2798 2878\n", - "\t 64686892 64686972\n", - "{'Q.START': 2797, 'Q.END': 2798, 'T.START': 64686890, 'T.END': 64686891, 'CG': '1='}\n", - "7046684 TO1000#1#chr03 2878 2879\n", - "\t 64686973 64686974\n", - "{'Q.START': 2798, 'Q.END': 2878, 'T.START': 64686892, 'T.END': 64686972, 'CG': '80='}\n", - "7046685 TO1000#1#chr03 2879 2951\n", - "\t 64686975 64687047\n", - "{'Q.START': 2878, 'Q.END': 2879, 'T.START': 64686973, 'T.END': 64686974, 'CG': '1='}\n", - "7046686 TO1000#1#chr03 2951 2952\n", - "\t 64687048 64687049\n", - "{'Q.START': 2879, 'Q.END': 2951, 'T.START': 64686975, 'T.END': 64687047, 'CG': '72='}\n", - "7046688 TO1000#1#chr03 2952 3002\n", - "\t 64687050 64687100\n", - "{'Q.START': 2951, 'Q.END': 2952, 'T.START': 64687048, 'T.END': 64687049, 'CG': '1='}\n", - "7046690 TO1000#1#chr03 3002 3077\n", - "\t 64687101 64687176\n", - "{'Q.START': 2952, 'Q.END': 3002, 'T.START': 64687050, 'T.END': 64687100, 'CG': '50='}\n", - "7046692 TO1000#1#chr03 3077 3078\n", - "\t 64687177 64687178\n", - "{'Q.START': 3002, 'Q.END': 3077, 'T.START': 64687101, 'T.END': 64687176, 'CG': '75='}\n", - "7046693 TO1000#1#chr03 3078 3093\n", - "\t 64687179 64687194\n", - "{'Q.START': 3077, 'Q.END': 3078, 'T.START': 64687177, 'T.END': 64687178, 'CG': '1='}\n", - "7046695 
TO1000#1#chr03 3093 3094\n", - "\t 64687195 64687196\n", - "{'Q.START': 3078, 'Q.END': 3093, 'T.START': 64687179, 'T.END': 64687194, 'CG': '15='}\n", - "7046696 TO1000#1#chr03 3094 3097\n", - "\t 64687197 64687200\n", - "{'Q.START': 3093, 'Q.END': 3094, 'T.START': 64687195, 'T.END': 64687196, 'CG': '1='}\n", - "7046698 TO1000#1#chr03 3097 3140\n", - "\t 64687201 64687244\n", - "{'Q.START': 3094, 'Q.END': 3097, 'T.START': 64687197, 'T.END': 64687200, 'CG': '3='}\n", - "7046700 TO1000#1#chr03 3140 3210\n", - "\t 64687245 64687315\n", - "{'Q.START': 3097, 'Q.END': 3140, 'T.START': 64687201, 'T.END': 64687244, 'CG': '43='}\n", - "7046702 TO1000#1#chr03 3210 3211\n", - "\t 64687316 64687317\n", - "{'Q.START': 3140, 'Q.END': 3210, 'T.START': 64687245, 'T.END': 64687315, 'CG': '70='}\n", - "7046703 TO1000#1#chr03 3211 3229\n", - "\t 64687318 64687336\n", - "{'Q.START': 3210, 'Q.END': 3211, 'T.START': 64687316, 'T.END': 64687317, 'CG': '1='}\n", - "7046704 TO1000#1#chr03 3229 3230\n", - "\t 64687337 64687338\n", - "{'Q.START': 3211, 'Q.END': 3229, 'T.START': 64687318, 'T.END': 64687336, 'CG': '18='}\n", - "7046706 TO1000#1#chr03 3230 3276\n", - "\t 64687339 64687385\n", - "{'Q.START': 3229, 'Q.END': 3230, 'T.START': 64687337, 'T.END': 64687338, 'CG': '1='}\n", - "7046707 TO1000#1#chr03 3276 3277\n", - "\t 64687386 64687387\n", - "{'Q.START': 3230, 'Q.END': 3276, 'T.START': 64687339, 'T.END': 64687385, 'CG': '46='}\n", - "7046709 TO1000#1#chr03 3277 3315\n", - "\t 64687388 64687426\n", - "{'Q.START': 3276, 'Q.END': 3277, 'T.START': 64687386, 'T.END': 64687387, 'CG': '1='}\n", - "7046710 TO1000#1#chr03 3315 3316\n", - "\t 64687427 64687428\n", - "{'Q.START': 3277, 'Q.END': 3315, 'T.START': 64687388, 'T.END': 64687426, 'CG': '38='}\n", - "7046712 TO1000#1#chr03 3316 3322\n", - "\t 64687429 64687435\n", - "{'Q.START': 3315, 'Q.END': 3316, 'T.START': 64687427, 'T.END': 64687428, 'CG': '1='}\n", - "7046713 TO1000#1#chr03 3322 3323\n", - "\t 64687436 64687437\n", - "{'Q.START': 
3316, 'Q.END': 3322, 'T.START': 64687429, 'T.END': 64687435, 'CG': '6='}\n", - "7046715 TO1000#1#chr03 3323 3348\n", - "\t 64687438 64687463\n", - "{'Q.START': 3322, 'Q.END': 3323, 'T.START': 64687436, 'T.END': 64687437, 'CG': '1='}\n", - "7046718 TO1000#1#chr03 3352 3353\n", - "\t 64687464 64687465\n", - "{'Q.START': 3323, 'Q.END': 3348, 'T.START': 64687438, 'T.END': 64687463, 'CG': '25='}\n", - "7046717 TO1000#1#chr03 3351 3352\n", - "\t 64687466 64687467\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", - "7046718 TO1000#1#chr03 3352 3353\n", - "\t 64687464 64687465\n", - "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 64687466, 'T.END': 64687467, 'CG': '1='}\n", - "7046717 TO1000#1#chr03 3351 3352\n", - "\t 64687466 64687467\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", - "7046718 TO1000#1#chr03 3352 3353\n", - "\t 64687464 64687465\n", - "{'Q.START': 3351, 'Q.END': 3352, 'T.START': 64687466, 'T.END': 64687467, 'CG': '1='}\n", - "7046720 TO1000#1#chr03 3353 3354\n", - "\t 64687468 64687469\n", - "{'Q.START': 3352, 'Q.END': 3353, 'T.START': 64687464, 'T.END': 64687465, 'CG': '1='}\n", - "7046722 TO1000#1#chr03 3354 3356\n", - "\t 64687470 64687472\n", - "{'Q.START': 3353, 'Q.END': 3354, 'T.START': 64687468, 'T.END': 64687469, 'CG': '1='}\n", - "7046724 TO1000#1#chr03 3356 3357\n", - "\t 64687473 64687474\n", - "{'Q.START': 3354, 'Q.END': 3356, 'T.START': 64687470, 'T.END': 64687472, 'CG': '2='}\n", - "7046725 TO1000#1#chr03 3357 3489\n", - "\t 64687475 64687607\n", - "{'Q.START': 3356, 'Q.END': 3357, 'T.START': 64687473, 'T.END': 64687474, 'CG': '1='}\n", - "7046727 TO1000#1#chr03 3489 3490\n", - "\t 64687608 64687609\n", - "{'Q.START': 3357, 'Q.END': 3489, 'T.START': 64687475, 'T.END': 64687607, 'CG': '132='}\n", - "7046728 TO1000#1#chr03 3490 3642\n", - "\t 64687610 64687762\n", - "{'Q.START': 3489, 'Q.END': 3490, 'T.START': 64687608, 'T.END': 64687609, 'CG': '1='}\n", 
- "7046729 TO1000#1#chr03 3642 3644\n", - "\t 64687763 64687765\n", - "{'Q.START': 3490, 'Q.END': 3642, 'T.START': 64687610, 'T.END': 64687762, 'CG': '152='}\n", - "7046730 TO1000#1#chr03 3644 3685\n", - "\t 64687766 64687807\n", - "{'Q.START': 3642, 'Q.END': 3644, 'T.START': 64687763, 'T.END': 64687765, 'CG': '2='}\n", - "7046731 TO1000#1#chr03 3685 3687\n", - "\t 64687808 64687810\n", - "{'Q.START': 3644, 'Q.END': 3685, 'T.START': 64687766, 'T.END': 64687807, 'CG': '41='}\n", - "7046733 TO1000#1#chr03 3687 3693\n", - "\t 64687811 64687817\n", - "{'Q.START': 3685, 'Q.END': 3687, 'T.START': 64687808, 'T.END': 64687810, 'CG': '2='}\n", - "7046735 TO1000#1#chr03 3693 3694\n", - "\t 64687818 64687819\n", - "{'Q.START': 3687, 'Q.END': 3693, 'T.START': 64687811, 'T.END': 64687817, 'CG': '6='}\n", - "7046736 TO1000#1#chr03 3694 3708\n", - "\t 64687820 64687834\n", - "{'Q.START': 3693, 'Q.END': 3694, 'T.START': 64687818, 'T.END': 64687819, 'CG': '1='}\n", - "7046738 TO1000#1#chr03 3720 3721\n", - "\t 64687835 64687836\n", - "{'Q.START': 3694, 'Q.END': 3708, 'T.START': 64687820, 'T.END': 64687834, 'CG': '14='}\n", - "7046739 TO1000#1#chr03 3721 3722\n", - "\t 64687837 64687838\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", - "7046740 TO1000#1#chr03 3716 3720\n", - "\t 64687839 64687843\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", - "7046738 TO1000#1#chr03 3720 3721\n", - "\t 64687835 64687836\n", - "{'Q.START': 3716, 'Q.END': 3720, 'T.START': 64687839, 'T.END': 64687843, 'CG': '4='}\n", - "7046739 TO1000#1#chr03 3721 3722\n", - "\t 64687837 64687838\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", - "7046740 TO1000#1#chr03 3716 3720\n", - "\t 64687839 64687843\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", - "7046738 TO1000#1#chr03 3720 3721\n", - "\t 64687835 64687836\n", - 
"{'Q.START': 3716, 'Q.END': 3720, 'T.START': 64687839, 'T.END': 64687843, 'CG': '4='}\n", - "7046739 TO1000#1#chr03 3721 3722\n", - "\t 64687837 64687838\n", - "{'Q.START': 3720, 'Q.END': 3721, 'T.START': 64687835, 'T.END': 64687836, 'CG': '1='}\n", - "7046741 TO1000#1#chr03 3722 3735\n", - "\t 64687844 64687877\n", - "{'Q.START': 3721, 'Q.END': 3722, 'T.START': 64687837, 'T.END': 64687838, 'CG': '1='}\n", - "ALN_2\n", - "7594382 D134#1#chr03 0 1\n", - "\tIn path\n", - "\t 70220037 70220038\n", - "skipped\n", - "\n", - "7594369 D134#1#chr03 32 33\n", - "\tIn path\n", - "\t 70219216 70219217\n", - "{'Q.START': 0, 'Q.END': 1, 'T.START': 70220037, 'T.END': 70220038, 'CG': '1='}\n", - "7594371 D134#1#chr03 15 16\n", - "\tIn path\n", - "\t 70221163 70221164\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 15, 'Q.END': 16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594356 D134#1#chr03 66 67\n", - "\tIn path\n", - "\t 70219570 70219571\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594375 D134#1#chr03 68 69\n", - "\tIn path\n", - "\t 70221598 70221599\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 
70219093, 'CG': '1='}\n", - "7594626 D134#1#chr03 10 11\n", - "\tIn path\n", - "\t 70219214 70219215\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", - "7594011 D134#1#chr03 11 12\n", - "\tIn path\n", - "\t 70219995 70219996\n", - "{'Q.START': 10, 'Q.END': 11, 'T.START': 70219214, 'T.END': 70219215, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 11, 'Q.END': 12, 'T.START': 70219995, 'T.END': 70219996, 'CG': '1='}\n", - "7594375 D134#1#chr03 68 69\n", - "\tIn path\n", - "\t 70221598 70221599\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594369 D134#1#chr03 32 33\n", - "\tIn path\n", - "\t 70219216 70219217\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", - "7594371 D134#1#chr03 15 16\n", - "\tIn path\n", - "\t 70221163 70221164\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 15, 'Q.END': 16, 'T.START': 70221163, 'T.END': 70221164, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594241 D134#1#chr03 20 21\n", - "\tIn path\n", - "\t 70219220 70219221\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594248 D134#1#chr03 21 22\n", - "\tNot in path\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - 
"{'Q.START': 20, 'Q.END': 21, 'T.START': 70219220, 'T.END': 70219221, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594315 D134#1#chr03 53 54\n", - "\tIn path\n", - "\t 70219857 70219858\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", - "7594330 D134#1#chr03 26 27\n", - "\tNot in path\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594315 D134#1#chr03 53 54\n", - "\tIn path\n", - "\t 70219857 70219858\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594369 D134#1#chr03 32 33\n", - "\tIn path\n", - "\t 70219216 70219217\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 70219216, 'T.END': 70219217, 'CG': '1='}\n", - "7594026 D134#1#chr03 37 38\n", - "\tIn path\n", - "\t 70220249 70220250\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 
D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594026 D134#1#chr03 37 38\n", - "\tIn path\n", - "\t 70220249 70220250\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 37, 'Q.END': 38, 'T.START': 70220249, 'T.END': 70220250, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': '1X'}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - 
"7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594315 D134#1#chr03 53 54\n", - "\tIn path\n", - "\t 70219857 70219858\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 70219857, 'T.END': 70219858, 'CG': '1='}\n", - "7594311 D134#1#chr03 55 56\n", - "\tIn path\n", - "\t 70219351 70219352\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 70219351, 'T.END': 70219352, 'CG': '1='}\n", - "7594021 D134#1#chr03 57 58\n", - "\tIn path\n", - "\t 70219218 70219219\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 70219218, 'T.END': 70219219, 'CG': 
'1X'}\n", - "7594286 D134#1#chr03 59 60\n", - "\tIn path\n", - "\t 70219349 70219350\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 70219349, 'T.END': 70219350, 'CG': '1='}\n", - "7594356 D134#1#chr03 66 67\n", - "\tIn path\n", - "\t 70219570 70219571\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594375 D134#1#chr03 68 69\n", - "\tIn path\n", - "\t 70221598 70221599\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", - "7594356 D134#1#chr03 66 67\n", - "\tIn path\n", - "\t 70219570 70219571\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 70219570, 'T.END': 70219571, 'CG': '1='}\n", - "7594375 D134#1#chr03 68 69\n", - "\tIn path\n", - "\t 70221598 70221599\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 'CG': '1='}\n", - "7594374 D134#1#chr03 69 70\n", - "\tIn path\n", - "\t 70219092 70219093\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 70221598, 'T.END': 70221599, 'CG': '1='}\n", - "7594350 D134#1#chr03 70 71\n", - "\tIn path\n", - "\t 70219226 70219227\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 70219092, 'T.END': 70219093, 
'CG': '1='}\n", - "7594264 D134#1#chr03 71 72\n", - "\tIn path\n", - "\t 70219228 70219229\n", - "{'Q.START': 70, 'Q.END': 71, 'T.START': 70219226, 'T.END': 70219227, 'CG': '1='}\n", - "7594207 D134#1#chr03 72 73\n", - "\tIn path\n", - "\t 70219230 70219231\n", - "{'Q.START': 71, 'Q.END': 72, 'T.START': 70219228, 'T.END': 70219229, 'CG': '1='}\n", - "7594225 D134#1#chr03 73 74\n", - "\tIn path\n", - "\t 70219232 70219233\n", - "{'Q.START': 72, 'Q.END': 73, 'T.START': 70219230, 'T.END': 70219231, 'CG': '1='}\n", - "7594227 D134#1#chr03 74 75\n", - "\tIn path\n", - "\t 70220150 70220151\n", - "{'Q.START': 73, 'Q.END': 74, 'T.START': 70219232, 'T.END': 70219233, 'CG': '1='}\n", - "7594120 D134#1#chr03 75 76\n", - "\tIn path\n", - "\t 70219236 70219237\n", - "{'Q.START': 74, 'Q.END': 75, 'T.START': 70220150, 'T.END': 70220151, 'CG': '1='}\n", - "7594132 D134#1#chr03 76 77\n", - "\tIn path\n", - "\t 70219777 70219778\n", - "{'Q.START': 75, 'Q.END': 76, 'T.START': 70219236, 'T.END': 70219237, 'CG': '1='}\n", - "7594165 D134#1#chr03 77 78\n", - "\tIn path\n", - "\t 70219240 70219241\n", - "{'Q.START': 76, 'Q.END': 77, 'T.START': 70219777, 'T.END': 70219778, 'CG': '1='}\n", - "7594172 D134#1#chr03 78 3735\n", - "\tNot in path\n", - "ALN_2\n", - "7594382 TO1000#1#chr03 0 1\n", - "\t 61731222 61731223\n", - "skipped\n", - "\n", - "7594369 TO1000#1#chr03 32 33\n", - "\t 61731060 61731061\n", - "{'Q.START': 0, 'Q.END': 1, 'T.START': 61731222, 'T.END': 61731223, 'CG': '1='}\n", - "7594371 TO1000#1#chr03 15 16\n", - "\tNot in path\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': 
'1='}\n", - "7594356 TO1000#1#chr03 66 67\n", - "\t 61731519 61731520\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594375 TO1000#1#chr03 68 69\n", - "\t 61733612 61733613\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594626 TO1000#1#chr03 10 11\n", - "\t 61731056 61731057\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", - "7594011 TO1000#1#chr03 11 12\n", - "\t 61733900 61733901\n", - "{'Q.START': 10, 'Q.END': 11, 'T.START': 61731056, 'T.END': 61731057, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 11, 'Q.END': 12, 'T.START': 61733900, 'T.END': 61733901, 'CG': '1='}\n", - "7594375 TO1000#1#chr03 68 69\n", - "\t 61733612 61733613\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594369 TO1000#1#chr03 32 33\n", - "\t 61731060 61731061\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", - "7594371 TO1000#1#chr03 15 16\n", - "\tNot in path\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 
61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594241 TO1000#1#chr03 20 21\n", - "\t 61731046 61731047\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594248 TO1000#1#chr03 21 22\n", - "\t 61734261 61734262\n", - "{'Q.START': 20, 'Q.END': 21, 'T.START': 61731046, 'T.END': 61731047, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 21, 'Q.END': 22, 'T.START': 61734261, 'T.END': 61734262, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594315 TO1000#1#chr03 53 54\n", - "\t 61733937 61733938\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", - "7594330 TO1000#1#chr03 26 27\n", - "\t 61731768 61731769\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 26, 'Q.END': 27, 'T.START': 61731768, 'T.END': 61731769, 'CG': '1='}\n", - "7594315 TO1000#1#chr03 53 54\n", - "\t 61733937 61733938\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594369 TO1000#1#chr03 32 33\n", - "\t 61731060 61731061\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 
61730921, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 32, 'Q.END': 33, 'T.START': 61731060, 'T.END': 61731061, 'CG': '1='}\n", - "7594026 TO1000#1#chr03 37 38\n", - "\t 61734267 61734268\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 37, 'Q.END': 38, 'T.START': 61734267, 'T.END': 61734268, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594026 TO1000#1#chr03 37 38\n", - "\t 61734267 61734268\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 37, 'Q.END': 38, 'T.START': 61734267, 'T.END': 61734268, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': 
'1X'}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594315 TO1000#1#chr03 53 54\n", - "\t 61733937 61733938\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 53, 'Q.END': 54, 'T.START': 61733937, 'T.END': 61733938, 'CG': '1='}\n", - "7594311 TO1000#1#chr03 55 56\n", - "\t 61731052 61731053\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 55, 'Q.END': 56, 'T.START': 61731052, 'T.END': 61731053, 'CG': '1='}\n", - "7594021 TO1000#1#chr03 57 58\n", - "\t 61730922 61730923\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594286 TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 57, 'Q.END': 58, 'T.START': 61730922, 'T.END': 61730923, 'CG': '1X'}\n", - "7594286 
TO1000#1#chr03 59 60\n", - "\t 61731054 61731055\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 59, 'Q.END': 60, 'T.START': 61731054, 'T.END': 61731055, 'CG': '1='}\n", - "7594356 TO1000#1#chr03 66 67\n", - "\t 61731519 61731520\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594375 TO1000#1#chr03 68 69\n", - "\t 61733612 61733613\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", - "7594356 TO1000#1#chr03 66 67\n", - "\t 61731519 61731520\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 66, 'Q.END': 67, 'T.START': 61731519, 'T.END': 61731520, 'CG': '1='}\n", - "7594375 TO1000#1#chr03 68 69\n", - "\t 61733612 61733613\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594374 TO1000#1#chr03 69 70\n", - "\t 61730920 61730921\n", - "{'Q.START': 68, 'Q.END': 69, 'T.START': 61733612, 'T.END': 61733613, 'CG': '1='}\n", - "7594350 TO1000#1#chr03 70 71\n", - "\t 61731066 61731067\n", - "{'Q.START': 69, 'Q.END': 70, 'T.START': 61730920, 'T.END': 61730921, 'CG': '1='}\n", - "7594264 TO1000#1#chr03 71 72\n", - "\t 61731068 61731069\n", - "{'Q.START': 70, 'Q.END': 71, 'T.START': 61731066, 'T.END': 61731067, 'CG': '1='}\n", - "7594207 TO1000#1#chr03 72 
73\n", - "\t 61731070 61731071\n", - "{'Q.START': 71, 'Q.END': 72, 'T.START': 61731068, 'T.END': 61731069, 'CG': '1='}\n", - "7594225 TO1000#1#chr03 73 74\n", - "\t 61731072 61731073\n", - "{'Q.START': 72, 'Q.END': 73, 'T.START': 61731070, 'T.END': 61731071, 'CG': '1='}\n", - "7594227 TO1000#1#chr03 74 75\n", - "\tNot in path\n", - "7594120 TO1000#1#chr03 75 76\n", - "\t 61731076 61731077\n", - "{'Q.START': 73, 'Q.END': 74, 'T.START': 61731072, 'T.END': 61731073, 'CG': '1='}\n", - "7594132 TO1000#1#chr03 76 77\n", - "\t 61733800 61733801\n", - "{'Q.START': 75, 'Q.END': 76, 'T.START': 61731076, 'T.END': 61731077, 'CG': '1='}\n", - "7594165 TO1000#1#chr03 77 78\n", - "\t 61731080 61731081\n", - "{'Q.START': 76, 'Q.END': 77, 'T.START': 61733800, 'T.END': 61733801, 'CG': '1='}\n", - "7594172 TO1000#1#chr03 78 3735\n", - "\tNot in path\n" - ] - } - ], - "source": [ - "ALNS = {}\n", - "## Iterating over alignments\n", - "for aln_name in aln_dict.keys():\n", - " \n", - " ## Iterating over paths of the gfa\n", - " for path_name in paths.keys():\n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(aln_name)\n", - " _ = [] # Temporary list holding alignment blocks\n", - "\n", - " ## Iterating over alignment nodes of the current alignment\n", - " for node_id, orient in aln_dict[aln_name][\"PATH.MATCH\"]:\n", - "\n", - " # Getting node info\n", - " n_info = nodes[node_id]\n", - " q_start = n_info[aln_name][\"START\"] # Start position on the query\n", - " q_end = n_info[aln_name][\"END\"] # End position on the query\n", - " _CG = n_info[aln_name][\"CIGAR\"] # Cigar of the alignment on the current node\n", - "\n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(node_id, path_name, q_start, q_end)\n", - "\n", - " ## Checking if path is traversing the current node\n", - " if path_name in list(n_info.keys()):\n", - " if path_name == \"D134#1#chr03\": print(\"\\tIn path\")\n", - "\n", - " ## Getting start and end position on the target given the 
orientation of the node in the alignment and the path\n", - " if n_info[aln_name][\"STRAND\"] == n_info[path_name][\"STRAND\"] :\n", - " t_start = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", - " t_end = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"] \n", - " else :\n", - " t_end = n_info[path_name][\"START\"]+n_info[aln_name][\"S.OFF\"]\n", - " t_start = n_info[path_name][\"END\"]+n_info[aln_name][\"E.OFF\"]\n", - "\n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\t\", t_start, t_end)\n", - "\n", - " \"\"\"\n", - " If the latest block t.end and q.end matches with the current node t.start and q.start, \n", - " the node should be added to the block. Else, we terminate the block and add the node to a new block\n", - " \"\"\"\n", - " \n", - " # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : \n", - " if len(_) and _[-1][\"T.END\"] == t_start and _[-1][\"Q.END\"]+1 == q_start: \n", - " tmp_aln[\"Q.END\"] = q_end\n", - " tmp_aln[\"T.END\"] = t_end\n", - " tmp_aln[\"CG\"] += _CG\n", - "# elif len(_) and _[-1][\"T.END\"] == t_start: # Following on the target not on the query (i.e. Insertion)\n", - "# tmp_aln[\"T.END\"] = t_end\n", - "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}I\"\n", - "# elif len(_) and _[-1][\"Q.END\"]+1 == q_start: # Following on the query, not on the target (i.e. 
Deletion)\n", - "# tmp_aln[\"Q.END\"] = q_end\n", - "# tmp_aln[\"CG\"] += f\"{nodes_length[node_id]}D\"\n", - " else : # Else, completely different\n", - " try : \n", - " _.append(tmp_aln)\n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(tmp_aln)\n", - " except : \n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"skipped\\n\")\n", - " tmp_aln = {\n", - " \"Q.START\": q_start,\n", - " \"Q.END\": q_end,\n", - " \"T.START\": t_start,\n", - " \"T.END\": t_end,\n", - " \"CG\": _CG,\n", - " }\n", - " \n", - " else : \n", - " if path_name in [\"TO1000#1#chr03\", \"D134#1#chr03\"]: print(\"\\tNot in path\")\n", - " # Node is not in the path\n", - "\n", - " del tmp_aln\n", - " \n", - " ALNS[(path_name, aln_name)] = _" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "547f03fa-cbd5-42f9-b668-1ca4404795ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'Q.START': 0, 'Q.END': 77, 'T.START': 73306158, 'T.END': 73306235, 'CG': '77='}, {'Q.START': 77, 'Q.END': 82, 'T.START': 73306238, 'T.END': 73306243, 'CG': '5='}, {'Q.START': 83, 'Q.END': 138, 'T.START': 73306246, 'T.END': 73306301, 'CG': '55='}, {'Q.START': 139, 'Q.END': 202, 'T.START': 73306302, 'T.END': 73306365, 'CG': '63='}, {'Q.START': 202, 'Q.END': 203, 'T.START': 73306366, 'T.END': 73306367, 'CG': '1='}, {'Q.START': 203, 'Q.END': 379, 'T.START': 73306368, 'T.END': 73306544, 'CG': '176='}, {'Q.START': 379, 'Q.END': 380, 'T.START': 73306545, 'T.END': 73306546, 'CG': '1='}, {'Q.START': 380, 'Q.END': 429, 'T.START': 73306547, 'T.END': 73306596, 'CG': '49='}, {'Q.START': 429, 'Q.END': 430, 'T.START': 73306597, 'T.END': 73306598, 'CG': '1='}, {'Q.START': 430, 'Q.END': 457, 'T.START': 73306599, 'T.END': 73306626, 'CG': '27='}, {'Q.START': 457, 'Q.END': 492, 'T.START': 73306641, 'T.END': 73306676, 'CG': '35='}, {'Q.START': 508, 'Q.END': 564, 'T.START': 73306694, 'T.END': 73306750, 'CG': '56='}, {'Q.START': 568, 
'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 568, 'Q.END': 569, 'T.START': 73306753, 'T.END': 73306754, 'CG': '1='}, {'Q.START': 569, 'Q.END': 824, 'T.START': 73306755, 'T.END': 73307010, 'CG': '255='}, {'Q.START': 826, 'Q.END': 858, 'T.START': 73307011, 'T.END': 73307043, 'CG': '32='}, {'Q.START': 858, 'Q.END': 859, 'T.START': 73307044, 'T.END': 73307045, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 868, 'Q.END': 869, 'T.START': 73307046, 'T.END': 73307047, 'CG': '1='}, {'Q.START': 869, 'Q.END': 913, 'T.START': 73307048, 'T.END': 73307092, 'CG': '44='}, {'Q.START': 913, 'Q.END': 919, 'T.START': 73307093, 'T.END': 73307099, 'CG': '6='}, {'Q.START': 919, 'Q.END': 978, 'T.START': 73307100, 'T.END': 73307159, 'CG': '59='}, {'Q.START': 978, 'Q.END': 979, 'T.START': 73307160, 'T.END': 73307161, 'CG': '1='}, {'Q.START': 979, 'Q.END': 1038, 'T.START': 73307162, 'T.END': 73307221, 'CG': '59='}, {'Q.START': 1038, 'Q.END': 1045, 'T.START': 73307224, 'T.END': 73307231, 'CG': '7='}, {'Q.START': 1045, 'Q.END': 1046, 'T.START': 73307232, 'T.END': 73307233, 'CG': '1='}, {'Q.START': 1046, 'Q.END': 1080, 'T.START': 73307234, 'T.END': 73307268, 
'CG': '34='}, {'Q.START': 1080, 'Q.END': 1081, 'T.START': 73307269, 'T.END': 73307270, 'CG': '1='}, {'Q.START': 1081, 'Q.END': 1107, 'T.START': 73307271, 'T.END': 73307297, 'CG': '26='}, {'Q.START': 1108, 'Q.END': 1183, 'T.START': 73307300, 'T.END': 73307375, 'CG': '75='}, {'Q.START': 1183, 'Q.END': 1186, 'T.START': 73307376, 'T.END': 73307379, 'CG': '3='}, {'Q.START': 1224, 'Q.END': 1257, 'T.START': 73307419, 'T.END': 73307452, 'CG': '33='}, {'Q.START': 1289, 'Q.END': 1311, 'T.START': 73307475, 'T.END': 73307497, 'CG': '22='}, {'Q.START': 1359, 'Q.END': 1382, 'T.START': 73307546, 'T.END': 73307569, 'CG': '23='}, {'Q.START': 1434, 'Q.END': 1451, 'T.START': 73307643, 'T.END': 73307660, 'CG': '17='}, {'Q.START': 1451, 'Q.END': 1531, 'T.START': 73307661, 'T.END': 73307741, 'CG': '80='}, {'Q.START': 1532, 'Q.END': 1543, 'T.START': 73307744, 'T.END': 73307755, 'CG': '11='}, {'Q.START': 1544, 'Q.END': 1572, 'T.START': 73307758, 'T.END': 73307786, 'CG': '28='}, {'Q.START': 1572, 'Q.END': 1573, 'T.START': 73307787, 'T.END': 73307788, 'CG': '1='}, {'Q.START': 1573, 'Q.END': 1587, 'T.START': 73307789, 'T.END': 73307803, 'CG': '14='}, {'Q.START': 1588, 'Q.END': 1616, 'T.START': 73307806, 'T.END': 73307834, 'CG': '28='}, {'Q.START': 1616, 'Q.END': 1617, 'T.START': 73307835, 'T.END': 73307836, 'CG': '1='}, {'Q.START': 1617, 'Q.END': 1646, 'T.START': 73307837, 'T.END': 73307866, 'CG': '29='}, {'Q.START': 1646, 'Q.END': 1661, 'T.START': 73307867, 'T.END': 73307882, 'CG': '15='}, {'Q.START': 1661, 'Q.END': 1673, 'T.START': 73307883, 'T.END': 73307895, 'CG': '12='}, {'Q.START': 1673, 'Q.END': 1674, 'T.START': 73307896, 'T.END': 73307897, 'CG': '1='}, {'Q.START': 1674, 'Q.END': 1726, 'T.START': 73307898, 'T.END': 73307950, 'CG': '52='}, {'Q.START': 1727, 'Q.END': 1762, 'T.START': 73307953, 'T.END': 73307988, 'CG': '35='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 
'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1765, 'Q.END': 1766, 'T.START': 73307993, 'T.END': 73307994, 'CG': '1='}, {'Q.START': 1766, 'Q.END': 1767, 'T.START': 73307991, 'T.END': 73307992, 'CG': '1='}, {'Q.START': 1767, 'Q.END': 1824, 'T.START': 73307995, 'T.END': 73308052, 'CG': '57='}, {'Q.START': 1824, 'Q.END': 1825, 'T.START': 73308053, 'T.END': 73308054, 'CG': '1='}, {'Q.START': 1825, 'Q.END': 1975, 'T.START': 73308055, 'T.END': 73308205, 'CG': '150='}, {'Q.START': 1976, 'Q.END': 2015, 'T.START': 73308208, 'T.END': 73308247, 'CG': '39='}, {'Q.START': 2016, 'Q.END': 2047, 'T.START': 73308250, 'T.END': 73308281, 'CG': '31='}, {'Q.START': 2047, 'Q.END': 2055, 'T.START': 73308286, 'T.END': 73308294, 'CG': '8='}, {'Q.START': 2056, 'Q.END': 2120, 'T.START': 73308297, 'T.END': 73308361, 'CG': '64='}, {'Q.START': 2120, 'Q.END': 2121, 'T.START': 73308362, 'T.END': 73308363, 'CG': '1='}, {'Q.START': 2121, 'Q.END': 2157, 'T.START': 73308364, 'T.END': 73308400, 'CG': '36='}, {'Q.START': 2158, 'Q.END': 2170, 'T.START': 73308403, 'T.END': 73308415, 'CG': '12='}, {'Q.START': 2170, 'Q.END': 2171, 'T.START': 73308416, 'T.END': 73308417, 'CG': '1='}, {'Q.START': 2171, 'Q.END': 2205, 'T.START': 73308418, 'T.END': 73308452, 'CG': '34='}, {'Q.START': 2206, 'Q.END': 2344, 'T.START': 73308455, 'T.END': 73308593, 'CG': '138='}, {'Q.START': 2345, 'Q.END': 2364, 'T.START': 73308596, 'T.END': 73308615, 'CG': '19='}, {'Q.START': 2364, 'Q.END': 2383, 'T.START': 73308616, 'T.END': 73308635, 'CG': '19='}, {'Q.START': 2383, 'Q.END': 2408, 'T.START': 73308636, 'T.END': 73308661, 'CG': '25='}, {'Q.START': 2408, 'Q.END': 2409, 'T.START': 73308662, 'T.END': 73308663, 'CG': '1='}, {'Q.START': 2409, 'Q.END': 2441, 'T.START': 73308664, 'T.END': 73308696, 'CG': '32='}, {'Q.START': 2441, 'Q.END': 2442, 'T.START': 73308697, 'T.END': 73308698, 'CG': '1='}, {'Q.START': 2442, 'Q.END': 2580, 'T.START': 73308699, 'T.END': 73308837, 'CG': 
'138='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2582, 'Q.END': 2583, 'T.START': 73308838, 'T.END': 73308839, 'CG': '1='}, {'Q.START': 2583, 'Q.END': 2584, 'T.START': 73308840, 'T.END': 73308841, 'CG': '1='}, {'Q.START': 2584, 'Q.END': 2764, 'T.START': 73308842, 'T.END': 73309022, 'CG': '180='}, {'Q.START': 2765, 'Q.END': 2797, 'T.START': 73309025, 'T.END': 73309057, 'CG': '32='}, {'Q.START': 2798, 'Q.END': 2878, 'T.START': 73309060, 'T.END': 73309140, 'CG': '80='}, {'Q.START': 2878, 'Q.END': 2879, 'T.START': 73309141, 'T.END': 73309142, 'CG': '1='}, {'Q.START': 2879, 'Q.END': 2951, 'T.START': 73309143, 'T.END': 73309215, 'CG': '72='}, {'Q.START': 2951, 'Q.END': 2952, 'T.START': 73309216, 'T.END': 73309217, 'CG': '1='}, {'Q.START': 2952, 'Q.END': 3002, 'T.START': 73309218, 'T.END': 73309268, 'CG': '50='}, {'Q.START': 3002, 'Q.END': 3077, 'T.START': 73309271, 'T.END': 73309346, 'CG': '75='}, {'Q.START': 3077, 'Q.END': 3078, 'T.START': 73309347, 'T.END': 73309348, 'CG': '1='}, {'Q.START': 3078, 'Q.END': 3093, 'T.START': 73309349, 'T.END': 73309364, 'CG': '15='}, {'Q.START': 3094, 'Q.END': 3097, 'T.START': 73309367, 'T.END': 73309370, 'CG': '3='}, {'Q.START': 3097, 'Q.END': 3140, 'T.START': 73309371, 'T.END': 73309414, 'CG': '43='}, {'Q.START': 3140, 'Q.END': 3210, 'T.START': 73309415, 'T.END': 73309485, 'CG': '70='}, {'Q.START': 3210, 'Q.END': 3211, 'T.START': 73309486, 'T.END': 73309487, 'CG': '1='}, {'Q.START': 3211, 'Q.END': 3229, 'T.START': 73309488, 'T.END': 73309506, 'CG': '18='}, {'Q.START': 3229, 'Q.END': 3230, 'T.START': 73309507, 'T.END': 73309508, 'CG': '1='}, {'Q.START': 3230, 'Q.END': 3276, 'T.START': 73309509, 'T.END': 73309555, 'CG': '46='}, {'Q.START': 3277, 'Q.END': 3315, 'T.START': 73309558, 'T.END': 73309596, 'CG': '38='}, {'Q.START': 3316, 'Q.END': 3322, 'T.START': 73309599, 'T.END': 73309605, 'CG': '6='}, 
{'Q.START': 3323, 'Q.END': 3348, 'T.START': 73309608, 'T.END': 73309633, 'CG': '25='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3351, 'Q.END': 3352, 'T.START': 73309636, 'T.END': 73309637, 'CG': '1='}, {'Q.START': 3352, 'Q.END': 3353, 'T.START': 73309634, 'T.END': 73309635, 'CG': '1='}, {'Q.START': 3353, 'Q.END': 3354, 'T.START': 73309638, 'T.END': 73309639, 'CG': '1='}, {'Q.START': 3354, 'Q.END': 3356, 'T.START': 73309640, 'T.END': 73309642, 'CG': '2='}, {'Q.START': 3357, 'Q.END': 3489, 'T.START': 73309645, 'T.END': 73309777, 'CG': '132='}, {'Q.START': 3490, 'Q.END': 3642, 'T.START': 73309780, 'T.END': 73309932, 'CG': '152='}, {'Q.START': 3644, 'Q.END': 3685, 'T.START': 73309933, 'T.END': 73309974, 'CG': '41='}, {'Q.START': 3687, 'Q.END': 3693, 'T.START': 73309977, 'T.END': 73309983, 'CG': '6='}, {'Q.START': 3694, 'Q.END': 3708, 'T.START': 73309986, 'T.END': 73310000, 'CG': '14='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}, {'Q.START': 3716, 'Q.END': 3720, 'T.START': 73310005, 'T.END': 73310009, 'CG': '4='}, {'Q.START': 3720, 'Q.END': 3721, 'T.START': 73310010, 'T.END': 73310011, 'CG': '1='}, {'Q.START': 3721, 'Q.END': 3722, 'T.START': 73310003, 'T.END': 73310004, 'CG': '1='}]\n" - ] - } - ], - "source": [ - "print(ALNS[(\"D134#1#chr03\", \"ALN_1\")])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": 
"python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/gaf2aln.py b/gaf2aln.py deleted file mode 100644 index 25fdebf..0000000 --- a/gaf2aln.py +++ /dev/null @@ -1,441 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -gaf2aln -Convert gaf alignement to sam or paf - -@author: alexis.mergez@inrae.fr -@version: 0.1 -""" - -import numpy as np -import pandas as pd -import argparse -import concurrent.futures -import os -import re - -version = "0.1" - -## Argument parser -arg_parser = argparse.ArgumentParser(description='GFAvc: GFA version converter') -arg_parser.add_argument( - "--gfa", - "-g", - dest = "gfa", - required = True, - help = "Graph (.gfa v1)" - ) -arg_parser.add_argument( - "--gaf", - "-a", - dest = "gaf", - required = True, - help = "Alignement file (.gaf)" - ) -arg_parser.add_argument( - "--format", - "-f", - dest = "format", - default = "P", - help = "Output file format. 
(S: sam, P: paf (default))" - ) -arg_parser.add_argument( - "--threads", - "-t", - dest = "threads", - required = False, - default = 1, - type = int, - help = "Number of threads" - ) -arg_parser.add_argument( - '--version', - '-v', - action="store_true", - dest = "version", - help = "Show version" -) -args = arg_parser.parse_args() - -# Printing version -if args.version: - print(version) - os._exit(0) - -# Toolbox -def walk2path(walk): - """ - Takes a walk in a single string and returns a list of nodes id with signs (gfa v1 like) - """ - _ = re.findall(r'>\w+|<\w+', walk) - # Converting ['>..', '>..', '<..', '>..'] to '..+,..+,..-,..+' - return [f'{elem[1:]}{(elem[0] == ">")*"+"+(elem[0] == "<")*"-"}' for elem in _] - -def cigar2basealn(cigar): - """ - Takes a CIGAR string and convert it into a list of base level alignment. - For example : "345=" -> ["=", "=", ..., "="] of length 345. - """ - _ = re.findall(r'\d+\D', cigar) - final_cigar = [] - for match in _: - final_cigar += [match[-1]]*int(match[:-1]) - - return final_cigar - -def basealn2cigar(base_aln_list): - - last_elem = base_aln_list[0] - CIGAR = [[1, last_elem]] - for elem in base_aln_list[1:]: - if elem == last_elem: - CIGAR[-1][0] += 1 - - else : - CIGAR[-1][0] = str(CIGAR[-1][0]) - CIGAR.append([1, elem]) - last_elem = elem - CIGAR[-1][0] = str(CIGAR[-1][0]) - return "".join(["".join(block) for block in CIGAR if block[1] != ""]) - -# Parsing the .gaf file -print(f"[gaf2aln::GAF Parser] Reading {args.gaf} ...") -with open(args.gaf, 'r') as file: - gaf_lines = file.readlines() - -gaf_col = [ - "QRY.NAME", "QRY.LEN", "QRY.START", "QRY.END", "STRAND", - "PATH.MATCH", "PATH.LEN", "ALN.START", "ALN.END", - "RES.MATCH", "ALN.BLOCK.LEN", "MAPPING.QUAL" - ] - -# Creating dictionnary to store alignments -print(f"[gaf2aln::GAF Parser] Extracting alignments ...") -aln_dict = {} -for line in range(len(gaf_lines)): - ## Splitting the line by tabulation - line_content = gaf_lines[line][:-1].split('\t') - - ## Adding 
alignement info to dictionnary - aln_dict[f"ALN_{line+1}"] = { - gaf_col[i]: line_content[i] for i in range(len(gaf_col)) - } - - ## Splitting "PATH.MATCH" into a list - aln_dict[f"ALN_{line+1}"]["PATH.MATCH"] = [ - (str(node_id[:-1]), node_id[-1]) - for node_id in walk2path(aln_dict[f"ALN_{line+1}"]["PATH.MATCH"]) - ] - - ## Adding CIGAR - aln_dict[f"ALN_{line+1}"]["RAW.CIGAR"] = line_content[-1] - - ## Adding tags - aln_dict[f"ALN_{line+1}"]["TAGS"] = ",".join(line_content[13:-1]) - -# Getting nodes of interest ids -aln_nodes = np.unique([ - str(node_id) - for aln in aln_dict.keys() - for node_id, orient in aln_dict[aln]["PATH.MATCH"] -]).tolist() - -del gaf_lines, gaf_col - -# Parsing the .gfa -print(f"[gaf2aln::GFA Parser] Reading {args.gfa} ...") -with open(args.gfa, 'r') as file: - gfa_lines = file.readlines() - -# Nodes length dictionnary structured as follow : -# {<NODE.ID>: <NODE.LENGTH>} -nodes_length = {} -# Nodes dictionnary structured as follow : -# { <ALN.NODE.ID> : { -# <PATH.NAME>: {"START": start, "END": end, "STRAND": strand), -# <ALN.NAME>: {"START": start, "END": end, "S.OFF": start.offset, "E.OFF": end.offset, "STRAND": strand, "CIGAR": CIGAR} -# } -# } -nodes = {node_id: {} for node_id in aln_nodes} -# Paths dictionnary structured as follow : -# {<PATH.NAME>: {NODES: {<NODE.ID>: <NODE.ORIENT>}, CIGAR: <CIGAR in comma separated list>} -paths = {} -# Links dictionnary structured as follow : -# {<FROM.NODE.ID>: {<TO.NODE.ID>: {FROM.ORIENT: <FROM.ORIENT>, TO.ORIENT: <TO.ORIENT>}}} -links = {} - -# Parsing the gfa -print(f"[gaf2aln::GFA Parser] Extracting nodes, paths and links ...") - -def GFA_parser(gfa_lines, nodes = nodes): - _links, _nodes, _nodes_length, paths = {}, {}, {}, {} - for line in gfa_lines: - line_content = line[:-1].split("\t") - line_id = line_content[0] - - # Segment line - if line_id == "S" : - - _nodes_length[str(line_content[1])] = len(line_content[2]) - - # Link line - elif line_id == "L": - try : - 
_links[str(line_content[1])][str(line_content[3])] = { - "FROM": str(line_content[2]), - "TO": str(line_content[4]) - } - - except : - _links[str(line_content[1])] = { - str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} - } - - # Path line - elif line_id == "P": - _paths[str(line_content[1])] = { - "NODES": { - str(node_id[:-1]): str(node_id[-1]) - for node_id in line_content[2].split(',') - }, - "CIGAR": line_content[3] - } - - return [_links, _nodes, _nodes_length, _paths] - -# splits = np.quantile(range(len(gfa_lines)+1), q= np.array(args.threads+1)/args.threads, method='higher').tolist() -# res = [] -# for i in range(1, len(splits)): -# res.append(executor.submit(GFA_parser, gfa_lines[splits[i-1]:splits[i]])) - -# for out in res: -# results = out.result() - -# for link_id, link_info in results[0].items(): -# links[] - - -for line in gfa_lines: - line_content = line[:-1].split("\t") - line_id = line_content[0] - - # Segment line - if line_id == "S" : - - nodes_length[str(line_content[1])] = len(line_content[2]) - - # Link line - elif line_id == "L": - try : - links[str(line_content[1])][str(line_content[3])] = { - "FROM": str(line_content[2]), - "TO": str(line_content[4]) - } - - except : - links[str(line_content[1])] = { - str(line_content[3]) : {"FROM.ORIENT": str(line_content[2]), "TO.ORIENT": str(line_content[4])} - } - - # Path line - elif line_id == "P": - paths[str(line_content[1])] = { - "NODES": { - str(node_id[:-1]): str(node_id[-1]) - for node_id in line_content[2].split(',') - }, - "CIGAR": line_content[3] - } - -del gfa_lines - -print(f"[gaf2aln::Graph position processing] Computing nodes positions in each paths...") -def get_node_pos(path_name, nodes = nodes, paths = paths, nodes_length = nodes_length): - print(f"[gaf2aln::Graph position processing] Running on {path_name} ...") - cur_pos = 0 - - out = {} - # Iterating over nodes in the path - for path_node in paths[path_name]["NODES"].keys(): - # 
Instead of checking if the node is one interesting node, we try to add to the nodes dict - if path_node in aln_nodes : - out[path_node] = { - "START": cur_pos, # Start position of the node start in the currrent path - "END": cur_pos+nodes_length[path_node], # End position of the node end in the current path - "STRAND": paths[path_name]["NODES"][path_node] # Orientation of the node in the current path - } - - cur_pos += nodes_length[path_node]+1 - else : - cur_pos += nodes_length[path_node]+1 - - return out - -res = {} -executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) -# Adding nodes positions relative to path -for path_name in paths.keys(): - res[path_name] = executor.submit(get_node_pos, path_name) - -executor.shutdown(wait=True) - -for path_name, out in res.items(): - results = out.result() - for path_node, node_pos in results.items(): - nodes[path_node][path_name] = node_pos - -del res - -print(f"[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...") -# Adding nodes positions relative to path - -def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length): - # Initializing current position in query - cur_pos = 0 - - # Getting start and end node ids - start_end_id = (aln_dict[aln_name]["PATH.MATCH"][0][0], aln_dict[aln_name]["PATH.MATCH"][-1][0]) - - # Creating result dictionnary - res = {} - - ## Iterating over node_ids from the given alignment - for node_id, orient in aln_dict[aln_name]["PATH.MATCH"]: - # Adding entry for current node - res[node_id] = {aln_name: {}} - - # First node - if node_id == start_end_id[0]: - start_pos = 0 - s_off = int(aln_dict[aln_name]["ALN.START"]) - end_pos = nodes_length[node_id]-s_off - e_off = 0 - # End node - elif node_id == start_end_id[1]: - start_pos = cur_pos - s_off = 0 - end_pos = int(aln_dict[aln_name]["QRY.END"]) - e_off = nodes_length[node_id]-(end_pos-cur_pos) - # Node in between - else : - start_pos = cur_pos - s_off, e_off = 0, 0 - 
end_pos = cur_pos+nodes_length[node_id] - - res[node_id] = { - "START": start_pos, # Start position on the query - "END": end_pos, # End position on the query - "S.OFF": s_off, # Offset between the start of the alignment and the node's start - "E.OFF": e_off, # Offset between the end of the alignment and the node's end - "STRAND": orient # Orientation of the node in the alignment - } - - cur_pos = end_pos - print(start_pos, end_pos, s_off, e_off, orient, nodes_length[node_id], cur_pos) - - return res - -# Storing alignement -res = {} -executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) -for aln_name in aln_dict.keys(): - print(f"[gaf2aln::Alignment position processing] Running on {aln_name} ...") - - res[aln_name] = executor.submit(get_aln_node_info, aln_name) - #res[aln_name] = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length) - -executor.shutdown(wait=True) - -for aln_name, node_info in res.items(): - results = node_info.result() - for node_id, info in results.items(): - nodes[node_id][aln_name] = info - -del res - -# Calculating CIGAR for each nodes in each aln -print(f"[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...") -# Iterating over alignments -for aln in aln_dict.keys(): - - print(f"[gaf2aln::CIGAR processing] Running on {aln} ...") - # Getting the list of base level alignement (["=", "X", ...] 
from "1=1X...") - raw_cigar = cigar2basealn(aln_dict[aln]["RAW.CIGAR"]) - CIGAR={} - - for node_id, orient in aln_dict[aln]["PATH.MATCH"]: - - _cigar = basealn2cigar(raw_cigar[ - nodes[node_id][aln]["START"]:nodes[node_id][aln]["END"] - ]) - nodes[node_id][aln]["CIGAR"] = _cigar - #print(_cigar, nodes[node_id][aln]["START"], nodes[node_id][aln]["END"]) - -#print(nodes) - -# Lifting graph alignements to haplotype alignements - -ALNS = {} -for aln_name in aln_dict.keys(): - - for path_name in paths.keys(): - - _ = [] - for node_id, orient in aln_dict[aln_name]["PATH.MATCH"]: - - n_info = nodes[node_id] - q_start = n_info[aln_name]["START"] - q_end = n_info[aln_name]["END"] - _CG = n_info[aln_name]["CIGAR"] - - print(node_id, path_name, q_start, q_end) - if path_name in list(n_info.keys()): - print("\tIn path") - - if n_info[aln_name]["STRAND"] == n_info[path_name]["STRAND"] : - t_start = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] - t_end = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] - else : - t_end = n_info[path_name]["START"]+n_info[aln_name]["S.OFF"] - t_start = n_info[path_name]["END"]+n_info[aln_name]["E.OFF"] - - print("\t", t_start, t_end) - - # Non empty temporary list of aln and ending of the last block is the same as the start of the new node : - if len(_) and _[-1]["T.END"] == t_start and _[-1]["Q.END"] == q_start: - tmp_aln["Q.END"] = q_end - tmp_aln["T.END"] = t_end - tmp_aln["CG"] += _CG - elif len(_) and _[-1]["T.END"] == t_start: # Following on the target not on the query (i.e. Insertion) - tmp_aln["T.END"] = t_end - tmp_aln["CG"] += f"{nodes_length[node_id]}I" - elif len(_) and _[-1]["Q.END"] == q_start: # Following on the query, not on the target (i.e. 
Deletion) - tmp_aln["Q.END"] = q_end - tmp_aln["CG"] += f"{nodes_length[node_id]}D" - else : # Else, completely different - try : - _.append(tmp_aln) - except : pass - tmp_aln = { - "Q.START": q_start, - "Q.END": q_end, - "T.START": t_start, - "T.END": t_end, - "CG": _CG, - } - - else : - print("\tNot in path") - # Node is not in the path - - - ALNS[(path_name, aln_name)] = _ - -## Debug -for elem in ALNS.keys(): - print(elem) - -for key, elem in ALNS.items(): - print(key) - print(elem) \ No newline at end of file -- GitLab