diff --git a/docs/api.rst b/docs/api.rst index 7dd6e25..9c6c85d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -11,6 +11,10 @@ Top Level Functions read_parquet read_json + read_avro + get_parquet_schema + get_json_schema + get_avro_schema Accessor ~~~~~~~~ diff --git a/src/akimbo/__init__.py b/src/akimbo/__init__.py index a92ec03..b3facd1 100644 --- a/src/akimbo/__init__.py +++ b/src/akimbo/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations -from awkward import ( # re-export - behavior, - metadata_from_parquet, - mixin_class, - mixin_class_method, -) +from awkward import behavior +from awkward import metadata_from_parquet as get_parquet_schema # re-export +from awkward import mixin_class, mixin_class_method import akimbo.datetimes as datetimes import akimbo.mixin as mixin import akimbo.strings as strings -from akimbo.io import join, read_avro, read_json, read_parquet +from akimbo.io import ( + get_avro_schema, + get_json_schema, + join, + read_avro, + read_json, + read_parquet, +) from akimbo.version import version as __version__ # noqa __all__ = ( @@ -23,6 +27,8 @@ "behavior", "mixin_class", "mixin_class_method", - "metadata_from_parquet", + "get_parquet_schema", + "get_json_schema", + "get_avro_schema", "strings", ) diff --git a/src/akimbo/mixin.py b/src/akimbo/mixin.py index 70a3b9b..5c20ed1 100644 --- a/src/akimbo/mixin.py +++ b/src/akimbo/mixin.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools import operator from typing import Callable, Iterable @@ -250,7 +252,35 @@ def unmerge(self): out = {k: self.to_output(arr[k]) for k in arr.fields} return self.dataframe_type(out) - def join(self, other, key: str, colname="match", sort=False, rkey=None): + def join( + self, + other, + key: str, + colname: str = "match", + sort: bool = False, + rkey: str | None = None, + numba: bool = True, + ): + """DB ORM-style left join to other dataframe/series with nesting but no copy + + Related records of the ``other`` table will appear as a 
list under the new field + ``colname`` for all matching keys. This is the speed and memory efficient way + to do a pandas-style merge/join, which explodes out the values to a much + bigger memory footprint. + + Parameters + ---------- + other: series or table + key: name of the field in this table to match on + colname: the field that will be added to each record. This field will exist even + if there are no matches, in which case the list will be empty. + sort: if False, assumes that the key is sorted in both tables. If True, an + argsort is performed first, and the match is done by indexing. This may be + significantly slower. + rkey: if the name of the field to match on is different in the ``other`` table. + numba: the matching algorithm will go much faster using numba. However, you can + set this to False if you do not have numba installed. + """ from akimbo.io import join out = join( diff --git a/src/akimbo/pandas.py b/src/akimbo/pandas.py index 61ea563..7946375 100644 --- a/src/akimbo/pandas.py +++ b/src/akimbo/pandas.py @@ -11,7 +11,7 @@ @pd.api.extensions.register_series_accessor("ak") @pd.api.extensions.register_dataframe_accessor("ak") class PandasAwkwardAccessor(Accessor): - """Perhaps awkward operations on pandas data + """Perform awkward operations on pandas data Nested structures are handled using arrow as the storage backend. If you use pandas object columns diff --git a/tests/test_io.py b/tests/test_io.py index bf79985..38557d5 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -46,7 +46,7 @@ def test_read_parquet(m): # noqa (m is a fixture) df.to_parquet(fn) out = akimbo.read_parquet(fn) - meta = akimbo.metadata_from_parquet(fn) + meta = akimbo.get_parquet_schema(fn) assert meta["columns"] == ["a.list.element"] # parquet column naming convention assert out.columns == ["a"] assert out.a.to_list() == data