diff --git a/modo/api.py b/modo/api.py index 2a2b8f86..dd1c43ac 100644 --- a/modo/api.py +++ b/modo/api.py @@ -10,6 +10,7 @@ import modo_schema.datamodel as model import s3fs import zarr +import re from .rdf import attrs_to_graph from .storage import add_metadata_group, init_zarr, list_zarr_items @@ -22,6 +23,7 @@ set_haspart_relationship, UserElementType, ) +from .cram import slice_cram, slice_remote_cram class MODO: @@ -50,6 +52,7 @@ def __init__( self, path: Union[Path, str], s3_endpoint: Optional[str] = None, + htsget_endpoint: Optional[str] = None, id: Optional[str] = None, name: Optional[str] = None, description: Optional[str] = None, @@ -58,6 +61,10 @@ def __init__( has_assay: List = [], source_uri: Optional[str] = None, ): + self.s3_endpoint = s3_endpoint + if s3_endpoint and not htsget_endpoint: + htsget_endpoint = re.sub(r"s3$", "htsget", s3_endpoint) + self.htsget_endpoint = htsget_endpoint self.path = Path(path) if s3_endpoint: fs = s3fs.S3FileSystem(endpoint_url=s3_endpoint, anon=True) @@ -370,3 +377,80 @@ def enrich_metadata(self): self.update_element(inst_names[ele.name], ele) else: continue + + def stream_cram( + self, + cram_path: str, + region: Optional[str] = None, + reference_filename: Optional[str] = None, + ): + """Slices both local and remote CRAM files returning an iterator.""" + + # check requested CRAM exists in MODO + if Path(cram_path) not in self.list_files(): + raise ValueError(f"{cram_path} not found in {self.path}.") + + if self.s3_endpoint: + # http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram + url = ( + self.htsget_endpoint + + "/reads/" + + str(Path(*Path(cram_path).parts[1:])) + ) + # str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform + cram_iter = slice_remote_cram( + url=url, region=region, reference_filename=reference_filename + ) + else: + # assuming user did not change directory, filepath should be the + # relative path to the file. + # for the time being, we do not check the validity of the supplied reference_filename, or + # the reference given in the CRAM header (used if refernece not supplied by user). + + cram_iter = slice_cram( + path=cram_path, + region=region, + reference_filename=reference_filename, + ) + + return cram_iter + + def save_cram( + self, + cram_path: str, + output_filename: str, + region: Optional[str] = None, + reference_filename: Optional[str] = None, + ): + """Slices the requested CRAM file, both local and remote, and writes + the output to local file""" + + # check requested CRAM exists in MODO + if Path(cram_path) not in self.list_files(): + raise ValueError(f"{cram_path} not found in {self.path}.") + + if self.s3_endpoint: + # http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram + url = ( + self.htsget_endpoint + + "/reads/" + + str(Path(*Path(cram_path).parts[1:])) + ) + # str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform + slice_remote_cram( + url=url, + region=region, + output_filename=output_filename, + reference_filename=reference_filename, + ) + else: + # assuming user did not change directory, filepath should be the + # relative path to the file. + # for the time being, we do not check the validity of the supplied reference_filename, or + # the reference given in the CRAM header (used if refernece not supplied by user). + slice_cram( + path=cram_path, + region=region, + output_filename=output_filename, + reference_filename=reference_filename, + ) diff --git a/modo/cram.py b/modo/cram.py index 6e1d5ee4..fb13b161 100644 --- a/modo/cram.py +++ b/modo/cram.py @@ -1,27 +1,90 @@ """Utilities to interact with genomic intervals in CRAM files.""" from pathlib import Path -from typing import Iterator, List +from typing import Iterator, List, Optional from pysam import ( AlignedSegment, AlignmentFile, - AlignmentHeader, ) -from rdflib import Graph +from urllib.parse import urlparse import modo_schema.datamodel as model -from .helpers import parse_region +import sys +import htsget +from .helpers import parse_region, bytesio_to_alignment_segments +from io import BytesIO -def slice_cram(path: str, region: str) -> Iterator[AlignedSegment]: +def slice_cram( + path: str, + region: Optional[str], + reference_filename: Optional[str] = None, + output_filename: Optional[str] = None, +) -> Iterator[AlignedSegment]: """Return an iterable slice of the CRAM file.""" + if region: + chrom, start, end = parse_region(region) + else: + chrom, start, end = None, None, None - chrom, start, stop = parse_region(region) - cramfile = AlignmentFile(path, "rc") + cramfile = AlignmentFile(path, "rc", reference_filename=reference_filename) + cram_iter = cramfile.fetch(chrom, start, end) - iter = cramfile.fetch(chrom, start, stop) + if output_filename: + output = AlignmentFile( + output_filename, + "wc", + template=cramfile, + reference_filename=reference_filename, + ) + for read in cram_iter: + output.write(read) + output.close() + + return cram_iter + + +def slice_remote_cram( + url: str, + region: Optional[str] = None, + reference_filename: Optional[str] = None, + output_filename: Optional[str] = None, +): + """Stream or write to a local file a slice of a remote CRAM file""" - return iter + url = urlparse(url) + url = url._replace(path=str(Path(url.path).with_suffix(""))) + + if region: + reference_name, start, end = parse_region(region) + else: + chrom, start, end = None, None, None + + if output_filename: + with open(output_filename, "wb") as output: + htsget.get( + url=url.geturl(), + output=output, + reference_name=reference_name, + start=start, + end=end, + data_format="cram", + ) + else: + htsget_response_buffer = BytesIO() + htsget.get( + url=url.geturl(), + output=htsget_response_buffer, # sys.stdout.buffer, + reference_name=reference_name, + start=start, + end=end, + data_format="cram", + ) + htsget_response_buffer.seek(0) + cram__iter = bytesio_to_alignment_segments( + htsget_response_buffer, reference_filename + ) + return cram__iter def extract_cram_metadata(cram: AlignmentFile) -> List: diff --git a/modo/helpers.py b/modo/helpers.py index 68591e3d..d061416f 100644 --- a/modo/helpers.py +++ b/modo/helpers.py @@ -2,7 +2,7 @@ from pathlib import Path import re import shutil -from typing import Any, Mapping, Optional +from typing import Any, Mapping, Optional, Iterator from urllib.parse import urlparse import zarr @@ -10,6 +10,13 @@ from .introspection import get_haspart_property, get_slot_range, load_schema +from io import BytesIO +import tempfile +from pysam import ( + AlignedSegment, + AlignmentFile, +) + def class_from_name(name: str): class_names = list(load_schema().all_classes().keys()) @@ -192,7 +199,7 @@ def is_uri(text: str): return False -def parse_region(region: str) -> tuple[str, int, int]: +def parse_region(region: str) -> tuple[str, Optional[int], Optional[int]]: """Parses an input UCSC-format region string into (chrom, start, end). @@ -200,16 +207,47 @@ def parse_region(region: str) -> tuple[str, int, int]: -------- >>> parse_region('chr1:10-320') ('chr1', 10, 320) - >>> parse_region('chr-1ba:32-0100') + >>> parse_region('chr-1ba:32-100') ('chr-1ba', 32, 100) + >>> parse_region('chr1:10') + ('chr1', 10, None) + >>> parse_region('chr1') + ('chr1', None, None) + >>> parse_region('*') + ('*', None, None) """ - if not re.match(r"[^:]+:[0-9]+-[0-9]+", region): + # region = region.strip() + matches = re.match(r"^([^:]+)(:([0-9]+)?(-[0-9]*)?)?$", region.strip()) + if not matches: raise ValueError( - f"Invalid region format: {region}. Expected chr:start-end" + f"Invalid region format: {region}. Expected 'chr:start-end' (start/end optional)" ) - chrom, coords = region.split(":") - start, end = coords.split("-") - - return (chrom, int(start), int(end)) + chrom, _, start, end = matches.groups() + if start: + start = int(start) + if end: + end = int(end.replace("-", "")) + + return (chrom, start, end) + + +def bytesio_to_alignment_segments( + bytesio_buffer, reference_filename: str +) -> Iterator[AlignedSegment]: + # Create a temporary file to write the bytesio data + with tempfile.NamedTemporaryFile() as temp_file: + # Write the contents of the BytesIO buffer to the temporary file + temp_file.write(bytesio_buffer.getvalue()) + + # Seek to the beginning of the temporary file + temp_file.seek(0) + + # Open the temporary file as a pysam.AlignmentFile object + with AlignmentFile( + temp_file.name, "rc", reference_filename=reference_filename + ) as alignment_file: + # Iterate over the alignments in the file + for alignment in alignment_file: + yield alignment diff --git a/poetry.lock b/poetry.lock index 24d5f9f2..aa94da74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -772,6 +772,36 @@ files = [ {file = "hbreader-0.9.1.tar.gz", hash = "sha256:d2c132f8ba6276d794c66224c3297cec25c8079d0a4cf019c061611e0a3b94fa"}, ] +[[package]] +name = "htsget" +version = "0.2.6" +description = "Python API and command line interface for the GA4GH htsget API." +optional = false +python-versions = "*" +files = [ + {file = "htsget-0.2.6-py2.py3-none-any.whl", hash = "sha256:0cca50c9aa6708c5a1d1df10b557fcac7100266ad64a78555337a357f02148a0"}, + {file = "htsget-0.2.6.tar.gz", hash = "sha256:84816534e2740d3bcc4fb44a6d703b387b8a392209be098752466c35ff253884"}, +] + +[package.dependencies] +humanize = "*" +requests = "*" +six = "*" + +[[package]] +name = "humanize" +version = "4.9.0" +description = "Python humanize utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "humanize-4.9.0-py3-none-any.whl", hash = "sha256:ce284a76d5b1377fd8836733b983bfb0b76f1aa1c090de2566fcf008d7f6ab16"}, + {file = "humanize-4.9.0.tar.gz", hash = "sha256:582a265c931c683a7e9b8ed9559089dea7edcf6cc95be39a3cbc2c5d5ac2bcfa"}, +] + +[package.extras] +tests = ["freezegun", "pytest", "pytest-cov"] + [[package]] name = "identify" version = "2.5.35" @@ -1670,6 +1700,7 @@ description = "A pure Python implementation of the trie data structure." optional = false python-versions = "*" files = [ + {file = "PyTrie-0.4.0-py3-none-any.whl", hash = "sha256:f687c224ee8c66cda8e8628a903011b692635ffbb08d4b39c5f92b18eb78c950"}, {file = "PyTrie-0.4.0.tar.gz", hash = "sha256:8f4488f402d3465993fb6b6efa09866849ed8cda7903b50647b7d0342b805379"}, ] @@ -1701,6 +1732,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2244,4 +2276,4 @@ jupyter = ["ipytree (>=0.2.2)", "ipywidgets (>=8.0.0)", "notebook"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "dfd44b35c33758f1b138c328bf42aa8957e283c1c963df502c647016a85f8965" +content-hash = "ef9d661c36d96c02b48287eeafa3f5839ee5e43add9200ff48e84606f255f683" diff --git a/pyproject.toml b/pyproject.toml index 8c24de17..3a0cc01d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pysam = "^0.22.0" modo_schema = { git = "https://github.com/sdsc-ordes/modo-schema.git", branch = "main" } typer = "^0.9.0" s3fs = "^2024.3.1" +htsget = "^0.2.6" [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.0"