Skip to content

Commit

Permalink
adds taxonomy prefixes and option to choose ranks
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson committed Feb 6, 2025
1 parent 025f297 commit 7b0d0bb
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
40 changes: 34 additions & 6 deletions rescript/get_pr2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from urllib.request import urlretrieve
from urllib.error import HTTPError
from pathlib import Path
from pandas import DataFrame
import pandas as pd
from collections import OrderedDict
from q2_types.feature_data import (TaxonomyFormat, DNAFASTAFormat,
DNAIterator)

Expand All @@ -28,13 +29,22 @@
# pr2_version_4.13.0_16S_mothur.fasta.gz
# pr2_version_4.13.0_18S_mothur.fasta.gz

# Ordered mapping of each PR2 rank name to the prefix that is prepended to
# that rank's label in the compiled taxonomy string.
_allowed_pr2_ranks = OrderedDict([
    ('domain', 'd__'),
    ('supergroup', 'sgr__'),
    ('division', 'dv__'),
    ('subdivision', 'dvs__'),
    ('class', 'c__'),
    ('order', 'o__'),
    ('family', 'f__'),
    ('genus', 'g__'),
    ('species', 's__'),
])
# The default selection is every allowed rank, in the mapping's order.
_default_pr2_ranks = list(_allowed_pr2_ranks)


def get_pr2_data(
    version: str = '5.0.0',
    ranks: list = None,
        ) -> (DNAIterator, pd.Series):
    """Download PR2 reference sequences and taxonomy for one release.

    Parameters
    ----------
    version : str
        PR2 database version to download.
    ranks : list of str, optional
        Names of the taxonomic ranks to keep in the taxonomy strings.
        When omitted (``None``), ``_default_pr2_ranks`` is used.

    Returns
    -------
    (DNAIterator, pd.Series)
        The sequences and the taxonomy produced by
        ``_retrieve_data_from_pr2`` for the assembled URLs.
    """
    # `None` is the signature default; downstream code tests membership
    # with `r in ranks`, which raises TypeError on None, so substitute the
    # documented default here.
    if ranks is None:
        ranks = _default_pr2_ranks

    urls = _assemble_pr2_urls(version=version)
    seqs, tax = _retrieve_data_from_pr2(urls, ranks)

    print('\n Saving files...\n')
    return seqs, tax
Expand All @@ -51,7 +61,24 @@ def _assemble_pr2_urls(version='5.0.0'):
return urls_to_retrieve


def _retrieve_data_from_pr2(urls_to_retrieve):
def _compile_taxonomy_output(tax, ranks):
    """Build prefixed taxonomy strings restricted to the requested ranks.

    Parameters
    ----------
    tax : pd.DataFrame
        Taxonomy table with a 'Taxon' column of ';'-delimited labels.
        The split must yield exactly one field per entry in
        ``_allowed_pr2_ranks``; otherwise pandas raises ``ValueError``.
        The frame is not modified.
    ranks : list of str or None
        Rank names (keys of ``_allowed_pr2_ranks``) to keep. They may be
        given in any order; output always follows the order of
        ``_allowed_pr2_ranks``. ``None`` selects ``_default_pr2_ranks``.

    Returns
    -------
    pd.Series
        Series named 'Taxon', indexed like ``tax``, whose values are the
        selected prefixed labels joined with '; '.
    """
    # Callers may forward an unset (None) rank selection; fall back to the
    # defaults instead of failing on the membership test below.
    if ranks is None:
        ranks = _default_pr2_ranks

    # One column per rank, labelled by that rank's prefix. Build a separate
    # frame so the caller's DataFrame is left untouched.
    prefixes = list(_allowed_pr2_ranks.values())
    levels = tax['Taxon'].str.strip(';').str.split(';', expand=True)
    levels.columns = prefixes

    # Each column's name is its prefix, so prepend it to every label.
    levels = levels.apply(lambda col: col.name + col)

    # Walk the allowed ranks in their canonical order so that user-supplied
    # ranks are emitted sorted, and only the requested ones are kept.
    selected = [prefix for rank, prefix in _allowed_pr2_ranks.items()
                if rank in ranks]
    taxonomy = levels.loc[:, selected].agg('; '.join, axis=1)
    return taxonomy.rename('Taxon')


def _retrieve_data_from_pr2(urls_to_retrieve, ranks):
# Perform check that `urls_to_retrieve` contains only
# 2 files, a 'fasta' and 'taxonomy' file.

Expand Down Expand Up @@ -84,5 +111,6 @@ def _retrieve_data_from_pr2(urls_to_retrieve):
mode="r").view(DNAIterator)
elif out_path.endswith('tax'):
tax = TaxonomyFormat(out_path,
mode="r").view(DataFrame)
return seqs, tax
mode="r").view(pd.DataFrame)
updated_tax = _compile_taxonomy_output(tax, ranks)
return seqs, updated_tax
14 changes: 11 additions & 3 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein)
from .get_gtdb import get_gtdb_data
from .get_unite import get_unite_data
from .get_pr2 import get_pr2_data
from .get_pr2 import get_pr2_data, _allowed_pr2_ranks, _default_pr2_ranks

citations = Citations.load('citations.bib', package='rescript')

Expand Down Expand Up @@ -1039,15 +1039,23 @@
inputs={},
parameters={
'version': Str % Choices(['5.0.0', '4.14.0',]),
'ranks': List[Str % Choices(_allowed_pr2_ranks)],
# 'rank_propagation': Bool,
},
outputs=[('pr2_sequences', FeatureData[Sequence]),
('pr2_taxonomy', FeatureData[Taxonomy]),],
input_descriptions={},
parameter_descriptions={
'version': 'PR2 database version to download.'},
'version': 'PR2 database version to download.',
'ranks': 'List of taxonomic ranks for building a taxonomy '
'from the PR2 Taxonomy database. '
"[default: '" +
"', '".join(_default_pr2_ranks) + "']",
},
output_descriptions={
'pr2_taxonomy': 'SSU PR2 reference taxonomy.',
'pr2_sequences': 'SSU PR2 reference sequences.'},
'pr2_sequences': 'SSU PR2 reference sequences.',
},
name='Download, parse, and import SSU PR2 reference data.',
description=(
'Download, parse, and import SSU PR2 files, given a version '
Expand Down

0 comments on commit 7b0d0bb

Please sign in to comment.