Skip to content
Snippets Groups Projects

Added DotPlot_BED_corrector.py

Merged Alexis Mergez requested to merge bed_corrector into main
1 file
+ 129
0
Compare changes
  • Side-by-side
  • Inline
+ 129
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DotPlot Bed like file corrector.
Corrects split paths in the BED-like file produced by `odgi untangle`, used for creating dot plots.
See Odgi documentation for the dot plot tutorial
@author: alexis.mergez@inrae.fr
@version: 0.1
"""
import re
import argparse
import os
import numpy as np
import time
import pandas as pd
from functools import reduce
import concurrent.futures
import gzip
version = "0.1"

## Argument parser
# NOTE: the description previously said "GFAstats: GFA statistics" — a
# copy-paste from another tool; corrected to describe this script.
arg_parser = argparse.ArgumentParser(
    description = 'DotPlot_BED_corrector: fix split paths in odgi untangle BED-like files'
)
arg_parser.add_argument(
    "--input",
    "-i",
    dest = "input",
    required = True,
    help = "Bed like file"
)
arg_parser.add_argument(
    "--output",
    "-o",
    dest = "output",
    required = True,
    help = "Output name"
)
# Use argparse's built-in "version" action: it prints the version and exits
# DURING parsing, before the required-arguments check. The previous
# `action="store_true"` + post-parse `if args.version: os._exit(0)` could
# never run, because parse_args() aborted first on the missing required
# --input/--output. os._exit() also skipped interpreter cleanup.
arg_parser.add_argument(
    '--version',
    '-v',
    action = "version",
    version = version,
    help = "Show version"
)
arg_parser.add_argument(
    '--progress',
    '-P',
    action = "store_true",
    dest = "progress",
    help = "Show progress to stdout"
)
args = arg_parser.parse_args()
# Importing the bed file with pandas (tab-separated, header taken from the file)
if args.progress : print(f"[Bed_corrector::Parsing] Reading {args.input} ...")
bed = pd.read_csv(
    args.input,
    sep = '\t'
)

# Getting the queries name and checking if we get multiples for one path
if args.progress : print(f"[Bed_corrector::Identify] Searching for splitted paths ...")
## Unique query names, each formatted as "<path>:<start>-<end>"
queries = bed["query.name"].unique()
## Extracting path name and ranges from the unique query names
paths = [query.split(":")[0] for query in queries]
ranges = np.array([query.split(":")[1].split("-") for query in queries])
## Temporary dataframe mapping each query to its path and numeric range.
## start/end are cast to int here: leaving them as strings would make the
## downstream .min()/.max() lexicographic (e.g. "600" > "5000"), producing
## wrong merged ranges for split paths.
temp_df = pd.DataFrame({
    "queries" : queries,
    "path" : paths,
    "start" : ranges[:, 0].astype(int),
    "end" : ranges[:, 1].astype(int)
})
## Getting the paths that are splitted (i.e. appearing in more than one query range)
splitted_paths = []
unique_paths, counts = np.unique(paths, return_counts = True)
for path, count in zip(unique_paths, counts):
    if count > 1 : # Path appears in several "<path>:<start>-<end>" queries -> it was split
        splitted_paths.append(path)
        if args.progress : print(f"[Bed_corrector::Identify] {path} is splitted")

# For each splitted path identified, we search the minimum start and the maximum end
if args.progress : print(f"[Bed_corrector::Identify] Searching for min start and max end of splitted paths ...")
min_start = {}
max_end = {}
for path_name in splitted_paths:
    ## The ranges were parsed from strings; cast to int BEFORE min/max so the
    ## comparison is numeric, not lexicographic (string min of "600"/"5000"
    ## would wrongly pick "5000").
    fragments = temp_df[temp_df.path == path_name]
    min_start[path_name] = int(fragments.start.astype(int).min())
    max_end[path_name] = int(fragments.end.astype(int).max())
# Traversing the bed dataframe and offsetting coordinates based on the
# min_start of the merged (main) path
if args.progress : print(f"[Bed_corrector::Patching] Correcting splitted paths ...")
## Working copies of the columns to patch
path_names = bed["query.name"].tolist()
starts = bed["query.start"].tolist()
ends = bed["query.end"].tolist()
## Set for O(1) membership tests inside the row loop (splitted_paths is a list)
splitted_set = set(splitted_paths)
## Iterating over rows
for i, query_name in enumerate(path_names):
    ## Name and range of the current row's query ("<path>:<start>-<end>")
    path, query_range = query_name.split(":")
    if path in splitted_set:
        ## Offset = distance between this fragment's start and the merged path's start
        offset = int(query_range.split('-')[0]) - min_start[path]
        ## Rewrite the query name with the merged range and shift the coordinates
        path_names[i] = f"{path}:{min_start[path]}-{max_end[path]}"
        starts[i] = int(starts[i]) + offset
        ends[i] = int(ends[i]) + offset
## Patching the bed with the corrected columns
bed["query.name"] = path_names
bed["query.start"] = starts
bed["query.end"] = ends

# Exporting the corrected BED-like file (tab-separated, no index column)
bed.to_csv(args.output, sep = "\t", index = False)
\ No newline at end of file
Loading