Source code for nasp.convert_external_genome

#!/usr/bin/env python3

__author__ = "David Smith"
__version__ = "1.0.0"
__email__ = "dsmith@tgen.org"

import logging


def _parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Meant to be called from the pipeline automatically.")
    parser.add_argument("--nucmerpath", default="nucmer", help="Path to the 'nucmer' executable.")
    parser.add_argument("--nucmerargs", default="", help="Optional arguments to pass to the 'nucmer' executable.")
    parser.add_argument("--deltafilterpath", default="delta-filter", help="Path to the 'delta-filter' executable.")
    parser.add_argument("--deltafilterargs", default="", help="Optional arguments to pass to the 'delta-filter' executable.")
    parser.add_argument("--reference", required=True, help="Path to the reference fasta file.")
    parser.add_argument("--external", required=True, help="Path to the external genome fasta file.")
    parser.add_argument("--name", default="", help="Name of this external genome.")
    return parser.parse_args()


# This should eventually be moved to the main job manager section
[docs]def generate_delta_file(nucmer_path, nucmer_args, delta_filter_path, delta_filter_args, external_nickname, reference_path, external_path): import subprocess import sys return_code = subprocess.call( [nucmer_path, ( "--prefix=" + external_nickname ), reference_path, external_path] + nucmer_args.split()) if return_code > 0: sys.stderr.write( "NASP WARNING: nucmer may have encountered errors while processing external genome '" + external_nickname + "', proceeding anyway\n") filtered_delta_handle = open(( external_nickname + ".filtered.delta" ), "w") return_code = subprocess.call([delta_filter_path, "-q", "-r", "-o", "100"] + delta_filter_args.split() + [(external_nickname + ".delta")], stdout=filtered_delta_handle) if return_code > 0: sys.stderr.write( "NASP WARNING: delta-filter may have encountered errors while processing external genome '" + external_nickname + "', proceeding anyway\n") filtered_delta_handle.close()
def _update_genome_from_delta_data(franken_genome, external_genome, parser_state, distance_covered, is_external_insert): from nasp.nasp_objects import Genome if distance_covered == -1: distance_covered = parser_state['final_pos'] - parser_state['reference_pos'] + 1 is_external_insert = True if distance_covered > 0: if parser_state['external_is_reversed']: matching_segment = Genome.reverse_complement(''.join( external_genome.get_call(( parser_state['external_pos'] - distance_covered + 1 ), parser_state['external_pos']))) else: matching_segment = ''.join(external_genome.get_call(parser_state['external_pos'], ( parser_state['external_pos'] + distance_covered - 1 ))) franken_genome.set_call(matching_segment, parser_state['reference_pos'], 'X') parser_state['reference_pos'] = parser_state['reference_pos'] + distance_covered parser_state['external_pos'] = parser_state['external_pos'] + ( -distance_covered if parser_state['external_is_reversed'] else distance_covered ) if is_external_insert: parser_state['external_pos'] += -1 if parser_state['external_is_reversed'] else 1 else: franken_genome.set_call('.', parser_state['reference_pos'], '!') parser_state['reference_pos'] += 1 return parser_state def _parse_delta_line(line_from_delta_file, franken_genome, external_genome, parser_state): import re line_match = re.match(r'^>([^ ]+) ([^ ]+) (\d+) \d+\s*$', line_from_delta_file) if line_match: current_contig = line_match.group(1) external_genome.set_current_contig(line_match.group(2)) parser_state['contig_sizes'][current_contig] = int(line_match.group(3)) franken_genome.add_contig(current_contig) else: line_match = re.match(r'^(\d+) (\d+) (\d+) (\d+) \d+ \d+ \d+\s*$', line_from_delta_file) if line_match: parser_state['reference_pos'] = int(line_match.group(1)) parser_state['final_pos'] = int(line_match.group(2)) parser_state['external_pos'] = int(line_match.group(3)) parser_state['external_is_reversed'] = ( True if ( parser_state['external_pos'] > int(line_match.group(4)) ) else False ) else: line_match = re.match(r'^(\-?)(\d+)\s*$', line_from_delta_file) if line_match: distance_covered = int(line_match.group(2)) - 1 is_external_insert = ( True if ( line_match.group(1) == '-' ) else False ) parser_state = _update_genome_from_delta_data(franken_genome, external_genome, parser_state, distance_covered, is_external_insert) return parser_state
[docs]def parse_delta_file(delta_filename, franken_genome, external_genome): parser_state = dict(zip(['contig_sizes', 'reference_pos', 'external_pos', 'final_pos', 'external_is_reversed'], [dict(), None, None, None, None])) delta_handle = open(delta_filename, 'r') for line_from_delta_file in delta_handle: parser_state = _parse_delta_line(line_from_delta_file, franken_genome, external_genome, parser_state) delta_handle.close() for current_contig in franken_genome.get_contigs(): franken_genome.extend_contig(parser_state['contig_sizes'][current_contig], 'X', current_contig)
[docs]def main(): import subprocess from nasp.nasp_objects import Genome, GenomeMeta commandline_args = _parse_args() external_nickname = commandline_args.name if commandline_args.name else GenomeMeta.generate_nickname_from_filename( commandline_args.external) #external_genome = Genome() #external_genome.import_fasta_file(commandline_args.external) generate_delta_file(commandline_args.nucmerpath, commandline_args.nucmerargs, commandline_args.deltafilterpath, commandline_args.deltafilterargs, external_nickname, commandline_args.reference, commandline_args.external) #franken_genome = Genome() #parse_delta_file(( external_nickname + ".filtered.delta" ), franken_genome, external_genome) #franken_genome.write_to_fasta_file(external_nickname + ".frankenfasta", "franken::") import pkg_resources nasptool_path = pkg_resources.resource_filename('nasp', 'nasptool_linux_64') with open(external_nickname + ".frankenfasta", 'w') as handle: return_code = subprocess.call([nasptool_path, "frankenfasta", external_nickname + ".filtered.delta"], stdout=handle)
if __name__ == "__main__": main()