Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/cram stream #31

Merged
merged 42 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
78d3e82
refactor: more comprehensive parse_region function, to allow for more…
AssafSternberg Mar 22, 2024
b605547
refactor: (1) more comprehensive parse_region function, to allow for …
AssafSternberg Mar 22, 2024
14374a7
fix: fixed expected tuple output of parse_region to match refactored …
AssafSternberg Mar 22, 2024
62d5da7
fix: fix parse_region examples to comply with the output of the funci…
AssafSternberg Mar 22, 2024
5c75a08
feat: add format_htsget_command function to return a valid htsget com…
AssafSternberg Mar 25, 2024
065a4a8
fix: add 'htsget' to the begining of the string output of format_htsg…
AssafSternberg Mar 25, 2024
c0446cc
fix: add missing '--' before 'end' in the example of the format_htsge…
AssafSternberg Mar 25, 2024
62bc195
fix: rename format_htsget_command function to htsget_command.
AssafSternberg Mar 25, 2024
a14df96
feat: add htsget_url function to format valid htsget URLs. Add filena…
AssafSternberg Mar 25, 2024
bb6520f
feat: Add stream_cram function that given a url and region either sli…
AssafSternberg Mar 25, 2024
f0f82d1
fix: remove 'import htsget' from cram.py, as it is not used.
AssafSternberg Mar 25, 2024
9628843
fix: changed stream_cram function name to slice_remote_cram, as strea…
AssafSternberg Mar 26, 2024
dad8a8b
feat: add stream_cram class method to MODO, to slice local or remote …
AssafSternberg Mar 28, 2024
80332b2
fix: fix wrong import of slice_cram and slice_remote_cram.
AssafSternberg Mar 28, 2024
adf87d0
Merge branch 'main' into feat/cram_stream
AssafSternberg Apr 23, 2024
cc86952
feat: add s3_endpoint and htsget_endpoint as MODO class variables. If…
AssafSternberg Apr 23, 2024
8a6ef5c
Revert "feat: add s3_endpoint and htsget_endpoint as MODO class varia…
AssafSternberg Apr 23, 2024
aa7fc97
feat: add s3_endpoint and htsget_endpoint as MODO class variables. If…
AssafSternberg Apr 23, 2024
2fd9ac4
refactor: change stream_cram to use cram_path instead of cram_name, a…
AssafSternberg Apr 23, 2024
59457a2
Revert "refactor: change stream_cram to use cram_path instead of cram…
AssafSternberg Apr 23, 2024
0a1a191
refactor: change stream_cram to use cram_path instead of cram_name, a…
AssafSternberg Apr 23, 2024
833573d
refactor: switch use of htsget API, instead of CLI, in slice_remote_c…
AssafSternberg Apr 23, 2024
7a03bf7
remove: remove file_from_url(), htsget_command(), and htsget)url(), a…
AssafSternberg Apr 23, 2024
ff382a5
chore: add htsget dependency
AssafSternberg Apr 23, 2024
5182169
fix: removed 'htsget_command' from import statement as it was removed
AssafSternberg Apr 23, 2024
1ae0bfe
Update modo/api.py
AssafSternberg Apr 24, 2024
eee35d8
fix: use named arguments in htsget.get()
AssafSternberg Apr 24, 2024
bf3102a
refactor: use named variables in htsget.get()
AssafSternberg Apr 24, 2024
3309d4a
fix: correct url parsing with urllib
almutlue Apr 25, 2024
977c945
fix: remove unintended test files
almutlue Apr 25, 2024
4d94ec9
fix: remove unintended binary file
almutlue Apr 25, 2024
8a3f7ec
refactor: parse_region now returns (chrom[str], start[int, optional],…
AssafSternberg Apr 25, 2024
263c5c2
Merge branch 'feat/cram_stream' of github.com:sdsc-ordes/smoc-api int…
AssafSternberg Apr 25, 2024
64eab6d
refactor: refactor slice_cram() to work with the new parse_region() o…
AssafSternberg Apr 25, 2024
aab8c46
fix: fix typo in parse_region() examples that caused test failure.
AssafSternberg Apr 25, 2024
a4c2bca
refactor: refactor slice_remote_cram() to work with the new parse_reg…
AssafSternberg Apr 25, 2024
24dc69e
fix: fix typo in parse_region() examples that caused test failure.
AssafSternberg Apr 25, 2024
bcc16f3
feat: add output_filename option to slice_cram(). When given, the cra…
AssafSternberg Apr 26, 2024
0b0ed93
feat: split stream_cram() method into two metods: stream_cram(), outp…
AssafSternberg Apr 26, 2024
4f25be5
fix: harmonize the output of MODO.stream_cram() method, to be a pysam…
AssafSternberg Apr 29, 2024
428c973
fix: remove unused test file
almutlue Apr 29, 2024
ec4eaab
fix: add reference filename to save_cram()
almutlue Apr 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions modo/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import modo_schema.datamodel as model
import s3fs
import zarr
import re

from .rdf import attrs_to_graph
from .storage import add_metadata_group, init_zarr, list_zarr_items
Expand All @@ -22,6 +23,7 @@
set_haspart_relationship,
UserElementType,
)
from .cram import slice_cram, slice_remote_cram


class MODO:
Expand Down Expand Up @@ -50,6 +52,7 @@ def __init__(
self,
path: Union[Path, str],
s3_endpoint: Optional[str] = None,
htsget_endpoint: Optional[str] = None,
id: Optional[str] = None,
name: Optional[str] = None,
description: Optional[str] = None,
Expand All @@ -58,6 +61,10 @@ def __init__(
has_assay: List = [],
source_uri: Optional[str] = None,
):
self.s3_endpoint = s3_endpoint
if s3_endpoint and not htsget_endpoint:
htsget_endpoint = re.sub(r"s3$", "htsget", s3_endpoint)
self.htsget_endpoint = htsget_endpoint
self.path = Path(path)
if s3_endpoint:
fs = s3fs.S3FileSystem(endpoint_url=s3_endpoint, anon=True)
Expand Down Expand Up @@ -370,3 +377,80 @@ def enrich_metadata(self):
self.update_element(inst_names[ele.name], ele)
else:
continue

def stream_cram(
self,
cram_path: str,
region: Optional[str] = None,
reference_filename: Optional[str] = None,
):
"""Slices both local and remote CRAM files returning an iterator."""

# check requested CRAM exists in MODO
if Path(cram_path) not in self.list_files():
raise ValueError(f"{cram_path} not found in {self.path}.")

if self.s3_endpoint:
# http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram
url = (
self.htsget_endpoint
+ "/reads/"
+ str(Path(*Path(cram_path).parts[1:]))
)
# str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform
cram_iter = slice_remote_cram(
url=url, region=region, reference_filename=reference_filename
)
else:
# assuming user did not change directory, filepath should be the
# relative path to the file.
# for the time being, we do not check the validity of the supplied reference_filename, or
# the reference given in the CRAM header (used if refernece not supplied by user).

cram_iter = slice_cram(
path=cram_path,
region=region,
reference_filename=reference_filename,
)

return cram_iter

def save_cram(
self,
cram_path: str,
output_filename: str,
region: Optional[str] = None,
reference_filename: Optional[str] = None,
):
"""Slices the requested CRAM file, both local and remote, and writes
the output to local file"""

# check requested CRAM exists in MODO
if Path(cram_path) not in self.list_files():
raise ValueError(f"{cram_path} not found in {self.path}.")

if self.s3_endpoint:
# http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram
url = (
self.htsget_endpoint
+ "/reads/"
+ str(Path(*Path(cram_path).parts[1:]))
)
# str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform
slice_remote_cram(
url=url,
region=region,
output_filename=output_filename,
reference_filename=reference_filename,
)
else:
# assuming user did not change directory, filepath should be the
# relative path to the file.
# for the time being, we do not check the validity of the supplied reference_filename, or
# the reference given in the CRAM header (used if refernece not supplied by user).
slice_cram(
path=cram_path,
region=region,
output_filename=output_filename,
reference_filename=reference_filename,
)
81 changes: 72 additions & 9 deletions modo/cram.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,90 @@
"""Utilities to interact with genomic intervals in CRAM files."""
from pathlib import Path
from typing import Iterator, List
from typing import Iterator, List, Optional

from pysam import (
AlignedSegment,
AlignmentFile,
AlignmentHeader,
)
from rdflib import Graph
from urllib.parse import urlparse
import modo_schema.datamodel as model

from .helpers import parse_region
import sys
import htsget
from .helpers import parse_region, bytesio_to_alignment_segments
from io import BytesIO


def slice_cram(path: str, region: str) -> Iterator[AlignedSegment]:
def slice_cram(
path: str,
region: Optional[str],
reference_filename: Optional[str] = None,
output_filename: Optional[str] = None,
) -> Iterator[AlignedSegment]:
"""Return an iterable slice of the CRAM file."""
if region:
chrom, start, end = parse_region(region)
else:
chrom, start, end = None, None, None

chrom, start, stop = parse_region(region)
cramfile = AlignmentFile(path, "rc")
cramfile = AlignmentFile(path, "rc", reference_filename=reference_filename)
cram_iter = cramfile.fetch(chrom, start, end)

iter = cramfile.fetch(chrom, start, stop)
if output_filename:
output = AlignmentFile(
output_filename,
"wc",
template=cramfile,
reference_filename=reference_filename,
)
for read in cram_iter:
output.write(read)
output.close()

return cram_iter


def slice_remote_cram(
url: str,
region: Optional[str] = None,
reference_filename: Optional[str] = None,
output_filename: Optional[str] = None,
):
"""Stream or write to a local file a slice of a remote CRAM file"""

return iter
url = urlparse(url)
url = url._replace(path=str(Path(url.path).with_suffix("")))

if region:
reference_name, start, end = parse_region(region)
else:
chrom, start, end = None, None, None

if output_filename:
with open(output_filename, "wb") as output:
htsget.get(
url=url.geturl(),
output=output,
reference_name=reference_name,
start=start,
end=end,
data_format="cram",
)
else:
htsget_response_buffer = BytesIO()
htsget.get(
url=url.geturl(),
output=htsget_response_buffer, # sys.stdout.buffer,
reference_name=reference_name,
start=start,
end=end,
data_format="cram",
)
htsget_response_buffer.seek(0)
cram__iter = bytesio_to_alignment_segments(
htsget_response_buffer, reference_filename
)
return cram__iter


def extract_cram_metadata(cram: AlignmentFile) -> List:
Expand Down
56 changes: 47 additions & 9 deletions modo/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@
from pathlib import Path
import re
import shutil
from typing import Any, Mapping, Optional
from typing import Any, Mapping, Optional, Iterator
from urllib.parse import urlparse
import zarr

import modo_schema.datamodel as model

from .introspection import get_haspart_property, get_slot_range, load_schema

from io import BytesIO
import tempfile
from pysam import (
AlignedSegment,
AlignmentFile,
)


def class_from_name(name: str):
class_names = list(load_schema().all_classes().keys())
Expand Down Expand Up @@ -192,24 +199,55 @@ def is_uri(text: str):
return False


def parse_region(region: str) -> tuple[str, int, int]:
def parse_region(region: str) -> tuple[str, Optional[int], Optional[int]]:
"""Parses an input UCSC-format region string into
(chrom, start, end).

Examples
--------
>>> parse_region('chr1:10-320')
('chr1', 10, 320)
>>> parse_region('chr-1ba:32-0100')
>>> parse_region('chr-1ba:32-100')
('chr-1ba', 32, 100)
>>> parse_region('chr1:10')
('chr1', 10, None)
>>> parse_region('chr1')
('chr1', None, None)
>>> parse_region('*')
('*', None, None)
"""

if not re.match(r"[^:]+:[0-9]+-[0-9]+", region):
# region = region.strip()
matches = re.match(r"^([^:]+)(:([0-9]+)?(-[0-9]*)?)?$", region.strip())
if not matches:
raise ValueError(
f"Invalid region format: {region}. Expected chr:start-end"
f"Invalid region format: {region}. Expected 'chr:start-end' (start/end optional)"
)

chrom, coords = region.split(":")
start, end = coords.split("-")

return (chrom, int(start), int(end))
chrom, _, start, end = matches.groups()
if start:
start = int(start)
if end:
end = int(end.replace("-", ""))

return (chrom, start, end)


def bytesio_to_alignment_segments(
bytesio_buffer, reference_filename: str
) -> Iterator[AlignedSegment]:
# Create a temporary file to write the bytesio data
with tempfile.NamedTemporaryFile() as temp_file:
# Write the contents of the BytesIO buffer to the temporary file
temp_file.write(bytesio_buffer.getvalue())

# Seek to the beginning of the temporary file
temp_file.seek(0)

# Open the temporary file as a pysam.AlignmentFile object
with AlignmentFile(
temp_file.name, "rc", reference_filename=reference_filename
) as alignment_file:
# Iterate over the alignments in the file
for alignment in alignment_file:
yield alignment
34 changes: 33 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pysam = "^0.22.0"
modo_schema = { git = "https://github.com/sdsc-ordes/modo-schema.git", branch = "main" }
typer = "^0.9.0"
s3fs = "^2024.3.1"
htsget = "^0.2.6"

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.6.0"
Expand Down
Loading