Skip to content

Commit

Permalink
feat: add s3_endpoint and htsget_endpoint as MODO class variables. If…
Browse files Browse the repository at this point in the history
… htsget_endpoint not given, it is assumed to have the same domain as s3_endpoint, or None (if no s3_endpoint is given)
  • Loading branch information
AssafSternberg committed Apr 23, 2024
1 parent adf87d0 commit cc86952
Showing 1 changed file with 26 additions and 29 deletions.
55 changes: 26 additions & 29 deletions modo/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,15 @@ class MODO:
['/sample/sample1']
# List files in the archive
>>> files = sorted([file.name for file in demo.list_files()])
>>> assert 'demo1.cram' in files
>>> assert 'reference1.fa' in files
>>> files = sorted([file.name for es
"""

def __init__(
self,
path: Union[Path, str],
s3_endpoint: Optional[str] = None,
htsget_endpoint: Optional[str] = None,
id: Optional[str] = None,
name: Optional[str] = None,
description: Optional[str] = None,
Expand All @@ -60,6 +59,10 @@ def __init__(
has_assay: List = [],
source_uri: Optional[str] = None,
):
self.s3_endpoint = s3_endpoint
if s3_endpoint and not htsget_endpoint:
htsget_endpoint = re.sub(r"s3$", "htsget", s3_endpoint)
self.htsget_endpoint = htsget_endpoint
self.path = Path(path)
if s3_endpoint:
fs = s3fs.S3FileSystem(endpoint_url=s3_endpoint, anon=True)
Expand All @@ -74,10 +77,10 @@ def __init__(
return
else:
fs = None
# Opening existing object
# Opening existing object
if (self.path / "data.zarr").exists():
self.archive = zarr.open(str(self.path / "data.zarr"))
# Creating from scratch
# Creating from scratch
else:
self.archive = init_zarr(self.path, fs)
self.id = id or self.path.name
Expand Down Expand Up @@ -188,7 +191,7 @@ def remove_element(self, element_id: str):
print(f"Available elements are {keys}")
raise err

# Remove data file
# Remove data file
if "data_path" in attrs.keys():
data_file = self.path / attrs["data_path"]
if data_file.exists():
Expand All @@ -204,7 +207,7 @@ def remove_element(self, element_id: str):
f"INFO: Permanently deleted {data_file} from remote filesystem."
)

# Remove element group
# Remove element group
del self.archive[element_id]

# Remove links from other elements
Expand Down Expand Up @@ -234,20 +237,20 @@ def add_element(
Parameters
----------
element
Element to add to the archive.
Element to add to the archive.
data_file
File to associate with the element.
File to associate with the element.
part_of
Id of the parent element. It must be scoped to the type.
For example "sample/foo".
Id of the parent element. It must be scoped to the type.
For example "sample/foo".
"""
# Check that ID does not exist in modo
if element.id in [Path(id).name for id in self.metadata.keys()]:
raise ValueError(
f"Please specify a unique ID. Element with ID {element.id} already exist."
)

# Copy data file to archive and update data_path in metadata
# Copy data file to archive and update data_path in metadata
fs = (
self.archive.store.fs
if isinstance(self.archive.store, zarr.storage.FSStore)
Expand Down Expand Up @@ -290,7 +293,7 @@ def _add_any_element(
f"Please specify a unique ID. Element with ID {element.id} already exist."
)

# Copy data file to archive and update data_path in metadata
# Copy data file to archive and update data_path in metadata
fs = (
self.archive.store.fs
if isinstance(self.archive.store, zarr.storage.FSStore)
Expand Down Expand Up @@ -326,9 +329,9 @@ def update_element(
Parameters
-----------------
element_id
Full id path in the zarr store.
Full id path in the zarr store.
new
Element containing the enriched metadata.
Element containing the enriched metadata.
"""
attrs = self.archive[element_id].attrs
attr_dict = attrs.asdict()
Expand Down Expand Up @@ -374,31 +377,25 @@ def enrich_metadata(self):
continue

def stream_cram(
self, cram_name: str, region: str = None, output: str = None
self, cram_path: str, region: str = None, output: str = None
):
"""Slices and streams the requested CRAM file, both local and remote,
and either outputs a data stream or writes data to local file"""

# check requested CRAM exists in MODO
path = ""
filepaths = list(self.list_files())
for filepath in filepaths:
if cram_name == str(filepath).split("/")[-1]:
path = filepath
break
if path == "":
raise ValueError(f"{cram_name} not fount in {self.path}.")
if Path(cram_path) not in self.list_files():
raise ValueError(f"{cram_path} not found in {self.path}.")

if self.s3_endpoint:
# http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram
url = (
re.sub(r"s3$", "", self.s3_endpoint)
+ "htsget/reads/"
+ path.split("/", maxsplit=1)
self.htsget_endpoint
+ "/reads/"
+ str(Path(*Path(cram_path).parts[1:]))
)
# str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform
slice_remote_cram(url, region, output)
else:
# assuming user did not change directory, filepath should be the
# relative path to the file.
iter = slice_cram(path, region)
iter = slice_cram(cram_path, region)
return iter

0 comments on commit cc86952

Please sign in to comment.