Source code for nasp.find_duplicates

#!/usr/bin/env python3

__author__ = "David Smith"
__version__ = "1.0.0"
__email__ = "dsmith@tgen.org"

import logging


def _parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Meant to be called from the pipeline automatically.")
    parser.add_argument("--nucmerpath", default="nucmer", help="Path to the 'nucmer' executable.")
    parser.add_argument("--reference", required=True, help="Path to the reference fasta file.")
    return parser.parse_args()


# TODO: This should eventually be moved to the main job manager section
[docs]def run_nucmer_on_reference(nucmer_path, reference_path): import subprocess import sys return_code = subprocess.call( [nucmer_path, "--prefix=reference", "--maxmatch", "--nosimplify", reference_path, reference_path]) if return_code > 0: sys.stderr.write( "NASP WARNING: nucmer may have encountered errors during reference duplicates checking, proceeding anyway\n")
def _parse_delta_line(line_from_delta_file, dups_data, current_contigs): import re line_match = re.match(r'^>([^ ]+) ([^ ]+) (\d+) (\d+)\s*$', line_from_delta_file) if line_match: current_contigs = ( line_match.group(1), line_match.group(2) ) contig_0_end = int(line_match.group(3)) contig_1_end = int(line_match.group(4)) dups_data.add_contig(current_contigs[0]) dups_data.add_contig(current_contigs[1]) if contig_0_end > dups_data.get_contig_length(current_contigs[0]): dups_data.extend_contig(contig_0_end, "0", current_contigs[0]) if contig_0_end > dups_data.get_contig_length(current_contigs[1]): dups_data.extend_contig(contig_1_end, "0", current_contigs[1]) else: line_match = re.match(r'^(\d+) (\d+) (\d+) (\d+) \d+ \d+ \d+\s*$', line_from_delta_file) if line_match: contig_0_start = int(line_match.group(1)) contig_0_end = int(line_match.group(2)) contig_1_start = int(line_match.group(3)) contig_1_end = int(line_match.group(4)) if ( current_contigs[0] != current_contigs[1] ) or ( ( contig_0_start != contig_1_start ) and ( contig_0_end != contig_1_end ) ): if contig_0_end < contig_0_start: contig_0_end, contig_0_start = contig_0_start, contig_0_end if contig_1_end < contig_1_start: contig_1_end, contig_1_start = contig_1_start, contig_1_end dups_data.set_value(( "1" * ( contig_0_end - contig_0_start + 1 ) ), contig_0_start, "!", current_contigs[0]) dups_data.set_value(( "1" * ( contig_1_end - contig_1_start + 1 ) ), contig_1_start, "!", current_contigs[1]) return current_contigs
[docs]def parse_delta_file(delta_filename, dups_data): current_contigs = ( "", "" ) delta_handle = open(delta_filename, 'r') for line_from_delta_file in delta_handle: current_contigs = _parse_delta_line(line_from_delta_file, dups_data, current_contigs) delta_handle.close()
[docs]def main(): from nasp.nasp_objects import GenomeStatus commandline_args = _parse_args() run_nucmer_on_reference(commandline_args.nucmerpath, commandline_args.reference) dups_data = GenomeStatus() parse_delta_file("reference.delta", dups_data) dups_data.write_to_fasta_file("duplicates.txt")
if __name__ == "__main__": main()