Source code for brokilon.core.taxon_map_utils
import re
[docs]
def change_taxon_map(input_nexus, output_nexus, new_map):
    """
    Apply a new taxon map to a nexus tree file and save the result.

    The translate block of ``input_nexus`` is read entry by entry. For every
    integer label found there, the entry ``label new_map[label]`` is written to
    ``output_nexus``. In every tree line, each integer taxon label (digits
    directly followed by ``:`` or ``[``) is replaced by the label that the same
    taxon name has in ``new_map``.

    Lines up to and including the ``translate`` line are copied verbatim. Once
    the translate block is closed only tree lines are written, followed by a
    final ``End;``.

    ``output_nexus`` is opened for writing (and therefore truncated) before the
    input is parsed, so it must not be the same file as ``input_nexus``, and an
    exception raised midway leaves a partially written output file.

    :param input_nexus: Path of the nexus file to read
    :param output_nexus: Path of the nexus file to write
    :param new_map: New taxon map, int label (keys) to taxon name (values)
    :type new_map: dict {int --> str}
    :raises KeyError: If a label of the old map is missing from ``new_map``, or
        a taxon name of the old map is not among the values of ``new_map``
    :raises ValueError: If a translate entry does not start with an integer
    """
    # TODO: check that the input file exists
    # TODO: check up front that new_map is compatible with the old map
    # Leading whitespace of any kind is accepted (tabs or spaces).
    begin_map = re.compile(r'\s*translate\s*$', re.I)
    end = re.compile(r'\s*;')
    re_tree = re.compile(r"\s*tree .*=? (.*$)", flags=re.I | re.MULTILINE)
    re_taxa = re.compile(r'([0-9]+)([\[:])')

    new_map_reversed = {name: label for (label, name) in new_map.items()}
    old_map = {}

    def remap(match):
        # old label -> taxon name -> new label; the ':' or '[' that followed
        # the label is kept unchanged.
        new_label = new_map_reversed[old_map[int(match.group(1))]]
        return f"{new_label}{match.group(2)}"

    within_map = False
    finished_map = False
    with open(input_nexus, "r") as in_file, open(output_nexus, "w") as out_file:
        for line in in_file:
            if begin_map.match(line):
                # Entering the translate block; the header line is kept as is.
                out_file.write(line)
                within_map = True
                continue

            if within_map:
                if end.match(line):
                    # A ';' on its own line closes the translate block.
                    out_file.write(";\n")
                    within_map = False
                    finished_map = True
                    continue
                tokens = line.split()
                if not tokens:
                    # Blank line inside the translate block: nothing to map.
                    continue
                cur_key = int(tokens[0])
                name = tokens[1]
                # The block may also be closed by a ';' on the last entry line.
                closes_map = line.rstrip().endswith(";")
                if name[-1] in ",;":
                    name = name[:-1]
                old_map[cur_key] = name
                # NOTE(review): every entry, including the last one, is written
                # with a trailing comma; kept to preserve the existing output.
                out_file.write(f"\t\t{cur_key} {new_map[cur_key]},\n")
                if closes_map:
                    out_file.write(";\n")
                    within_map = False
                    finished_map = True
                continue

            if not finished_map:
                # Everything that comes before the translate block is copied.
                out_file.write(line)

            tree_match = re_tree.match(line)
            if tree_match:
                # Keep the header exactly as it was (including comments such as
                # "[&lnP=...]" or "[&R]") and relabel only the newick string.
                header = line[:tree_match.start(1)]
                new_newick = re_taxa.sub(remap, tree_match.group(1))
                out_file.write(f"{header}{new_newick}\n")
        out_file.write("End;")
[docs]
def get_mapping_dict(file: str) -> dict:
    """
    Returns the taxon mapping of the nexus file as a dictionary

    Reading starts after the first line that consists of ``translate`` (any
    case, any surrounding whitespace). It stops at the first line that starts
    with ``;``, or after an entry line that ends with ``;``. Blank lines inside
    the block are ignored. If no translate line is found the dictionary is
    empty.

    :param file: A nexus file path
    :type file: str
    :return: Dictionary containing the mapping of taxa(values) to int(keys)
    :rtype: dict {int --> str}
    :raises ValueError: If an entry does not start with an integer
    :raises IndexError: If an entry has a label but no taxon name
    """
    begin_map = re.compile(r'\s*translate\s*$', re.I)
    end = re.compile(r'\s*;')
    mapping = {}
    within_map = False
    with open(file) as f:
        for line in f:
            if not within_map:
                # Still looking for the start of the translate block.
                if begin_map.match(line):
                    within_map = True
                continue
            if end.match(line):
                # A ';' on its own line closes the block.
                break
            tokens = line.split()
            if not tokens:
                continue
            name = tokens[1]
            # Drop the separator (',') or terminator (';') glued to the name.
            if name[-1] in ",;":
                name = name[:-1]
            mapping[int(tokens[0])] = name
            if line.rstrip().endswith(";"):
                # The block was closed on the last entry line itself.
                break
    return mapping