feat: add s3_endpoint and htsget_endpoint as MODO class variables. If…

… htsget_endpoint is not given, it is assumed to have the same domain as s3_endpoint, or to be None (if no s3_endpoint is given)
sdsc-ordes · Apr 23, 2024 · cc86952 · cc86952
1 parent adf87d0
commit cc86952
Showing 1 changed file with 26 additions and 29 deletions.
diff --git a/modo/api.py b/modo/api.py
@@ -42,16 +42,15 @@ class MODO:
     ['/sample/sample1']
 
     # List files in the archive
-    >>> files = sorted([file.name for file in demo.list_files()])
-    >>> assert 'demo1.cram' in files
-    >>> assert 'reference1.fa' in files
+    >>> files = sorted([file.name for es
 
     """
 
     def __init__(
         self,
         path: Union[Path, str],
         s3_endpoint: Optional[str] = None,
+        htsget_endpoint: Optional[str] = None,
         id: Optional[str] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
@@ -60,6 +59,10 @@ def __init__(
         has_assay: List = [],
         source_uri: Optional[str] = None,
     ):
+        self.s3_endpoint = s3_endpoint
+        if s3_endpoint and not htsget_endpoint:
+            htsget_endpoint = re.sub(r"s3$", "htsget", s3_endpoint)
+        self.htsget_endpoint = htsget_endpoint
         self.path = Path(path)
         if s3_endpoint:
             fs = s3fs.S3FileSystem(endpoint_url=s3_endpoint, anon=True)
@@ -74,10 +77,10 @@ def __init__(
                 return
         else:
             fs = None
-        # Opening existing object
+            # Opening existing object
         if (self.path / "data.zarr").exists():
             self.archive = zarr.open(str(self.path / "data.zarr"))
-        # Creating from scratch
+            # Creating from scratch
         else:
             self.archive = init_zarr(self.path, fs)
             self.id = id or self.path.name
@@ -188,7 +191,7 @@ def remove_element(self, element_id: str):
             print(f"Available elements are {keys}")
             raise err
 
-        # Remove data file
+            # Remove data file
         if "data_path" in attrs.keys():
             data_file = self.path / attrs["data_path"]
             if data_file.exists():
@@ -204,7 +207,7 @@ def remove_element(self, element_id: str):
                     f"INFO: Permanently deleted {data_file} from remote filesystem."
                 )
 
-        # Remove element group
+                # Remove element group
         del self.archive[element_id]
 
         # Remove links from other elements
@@ -234,20 +237,20 @@ def add_element(
         Parameters
         ----------
         element
-            Element to add to the archive.
+                Element to add to the archive.
         data_file
-            File to associate with the element.
+                File to associate with the element.
         part_of
-            Id of the parent element. It must be scoped to the type.
-            For example "sample/foo".
+                Id of the parent element. It must be scoped to the type.
+                For example "sample/foo".
         """
         # Check that ID does not exist in modo
         if element.id in [Path(id).name for id in self.metadata.keys()]:
             raise ValueError(
                 f"Please specify a unique ID. Element with ID {element.id} already exist."
             )
 
-        # Copy data file to archive and update data_path in metadata
+            # Copy data file to archive and update data_path in metadata
         fs = (
             self.archive.store.fs
             if isinstance(self.archive.store, zarr.storage.FSStore)
@@ -290,7 +293,7 @@ def _add_any_element(
                 f"Please specify a unique ID. Element with ID {element.id} already exist."
             )
 
-        # Copy data file to archive and update data_path in metadata
+            # Copy data file to archive and update data_path in metadata
         fs = (
             self.archive.store.fs
             if isinstance(self.archive.store, zarr.storage.FSStore)
@@ -326,9 +329,9 @@ def update_element(
         Parameters
         -----------------
         element_id
-            Full id path in the zarr store.
+                Full id path in the zarr store.
         new
-            Element containing the enriched metadata.
+                Element containing the enriched metadata.
         """
         attrs = self.archive[element_id].attrs
         attr_dict = attrs.asdict()
@@ -374,31 +377,25 @@ def enrich_metadata(self):
                     continue
 
     def stream_cram(
-        self, cram_name: str, region: str = None, output: str = None
+        self, cram_path: str, region: str = None, output: str = None
     ):
         """Slices and streams the requested CRAM file, both local and remote,
         and either outputs a data stream or writes data to local file"""
 
         # check requested CRAM exists in MODO
-        path = ""
-        filepaths = list(self.list_files())
-        for filepath in filepaths:
-            if cram_name == str(filepath).split("/")[-1]:
-                path = filepath
-                break
-        if path == "":
-            raise ValueError(f"{cram_name} not fount in {self.path}.")
+        if Path(cram_path) not in self.list_files():
+            raise ValueError(f"{cram_path} not found in {self.path}.")
 
         if self.s3_endpoint:
-            # http://domain/s3 + bucket/modo/file.cram --> http://domain/htsget/reads/modo/file.cram
             url = (
-                re.sub(r"s3$", "", self.s3_endpoint)
-                + "htsget/reads/"
-                + path.split("/", maxsplit=1)
+                self.htsget_endpoint
+                + "/reads/"
+                + str(Path(*Path(cram_path).parts[1:]))
             )
+            # str(Path(*Path(cram_path).parts[1:])) same as path.split("/", maxsplit=1)[1] but cross-platform
             slice_remote_cram(url, region, output)
         else:
             # assuming user did not change directory, filepath should be the
             # relative path to the file.
-            iter = slice_cram(path, region)
+            iter = slice_cram(cram_path, region)
             return iter