Skip to content
Snippets Groups Projects

Added DotPlot_BED_corrector.py

Merged Alexis Mergez requested to merge bed_corrector into main
1 file
+ 129
0
Compare changes
  • Side-by-side
  • Inline
+ 129
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DotPlot Bed like file corrector.
Corrects split paths in the BED-like file produced by `odgi untangle`, used for creating dot plots.
See Odgi documentation for the dot plot tutorial
@author: alexis.mergez@inrae.fr
@version: 0.1
"""
import re
import argparse
import os
import numpy as np
import time
import pandas as pd
from functools import reduce
import concurrent.futures
import gzip
version = "0.1"

## Argument parser
# NOTE: the description previously said "GFAstats: GFA statistics" — a
# copy-paste from another tool; corrected to describe this script.
arg_parser = argparse.ArgumentParser(
    description = 'DotPlot_BED_corrector: fix split paths in odgi untangle BED-like files'
)
arg_parser.add_argument(
    "--input",
    "-i",
    dest = "input",
    required = True,
    help = "Bed like file"
)
arg_parser.add_argument(
    "--output",
    "-o",
    dest = "output",
    required = True,
    help = "Output name"
)
# Use argparse's built-in "version" action: it prints the version and exits
# DURING parsing, before the required-arguments check. The previous
# `action="store_true"` + post-parse `if args.version: os._exit(0)` could
# never run, because parse_args() aborted first on the missing required
# --input/--output. os._exit() also skipped interpreter cleanup.
arg_parser.add_argument(
    '--version',
    '-v',
    action = "version",
    version = version,
    help = "Show version"
)
arg_parser.add_argument(
    '--progress',
    '-P',
    action = "store_true",
    dest = "progress",
    help = "Show progress to stdout"
)
args = arg_parser.parse_args()
# Importing the bed file with pandas (tab-separated, header taken from the file)
if args.progress : print(f"[Bed_corrector::Parsing] Reading {args.input} ...")
bed = pd.read_csv(
    args.input,
    sep = '\t'
)

# Getting the queries name and checking if we get multiples for one path
if args.progress : print(f"[Bed_corrector::Identify] Searching for splitted paths ...")
## Unique query names, each formatted as "<path>:<start>-<end>"
queries = bed["query.name"].unique()
## Extracting path name and ranges from the unique query names
paths = [query.split(":")[0] for query in queries]
ranges = np.array([query.split(":")[1].split("-") for query in queries])
## Temporary dataframe mapping each query to its path and numeric range.
## start/end are cast to int here: leaving them as strings would make the
## downstream .min()/.max() lexicographic (e.g. "600" > "5000"), producing
## wrong merged ranges for split paths.
temp_df = pd.DataFrame({
    "queries" : queries,
    "path" : paths,
    "start" : ranges[:, 0].astype(int),
    "end" : ranges[:, 1].astype(int)
})
## Getting the paths that are splitted (i.e. appearing in more than one query range)
splitted_paths = []
unique_paths, counts = np.unique(paths, return_counts = True)
for path, count in zip(unique_paths, counts):
    if count > 1 : # Path appears in several "<path>:<start>-<end>" queries -> it was split
        splitted_paths.append(path)
        if args.progress : print(f"[Bed_corrector::Identify] {path} is splitted")

# For each splitted path identified, we search the minimum start and the maximum end
if args.progress : print(f"[Bed_corrector::Identify] Searching for min start and max end of splitted paths ...")
min_start = {}
max_end = {}
for path_name in splitted_paths:
    ## The ranges were parsed from strings; cast to int BEFORE min/max so the
    ## comparison is numeric, not lexicographic (string min of "600"/"5000"
    ## would wrongly pick "5000").
    fragments = temp_df[temp_df.path == path_name]
    min_start[path_name] = int(fragments.start.astype(int).min())
    max_end[path_name] = int(fragments.end.astype(int).max())
# Traversing the bed dataframe and offsetting coordinates based on the
# min_start of the merged (main) path
if args.progress : print(f"[Bed_corrector::Patching] Correcting splitted paths ...")
## Working copies of the columns to patch
path_names = bed["query.name"].tolist()
starts = bed["query.start"].tolist()
ends = bed["query.end"].tolist()
## Set for O(1) membership tests inside the row loop (splitted_paths is a list)
splitted_set = set(splitted_paths)
## Iterating over rows
for i, query_name in enumerate(path_names):
    ## Name and range of the current row's query ("<path>:<start>-<end>")
    path, query_range = query_name.split(":")
    if path in splitted_set:
        ## Offset = distance between this fragment's start and the merged path's start
        offset = int(query_range.split('-')[0]) - min_start[path]
        ## Rewrite the query name with the merged range and shift the coordinates
        path_names[i] = f"{path}:{min_start[path]}-{max_end[path]}"
        starts[i] = int(starts[i]) + offset
        ends[i] = int(ends[i]) + offset
## Patching the bed with the corrected columns
bed["query.name"] = path_names
bed["query.start"] = starts
bed["query.end"] = ends

# Exporting the corrected BED-like file (tab-separated, no index column)
bed.to_csv(args.output, sep = "\t", index = False)
\ No newline at end of file
Loading