Dataset-specific JSON/TOML configuration #1646

Status: Open. 24 commits to merge into base branch dev.

Commits:
- 8bea1fd Add JSONMatcher class (franzpoeschel, Jul 9, 2024)
- b5107e0 Embed JSONMatcher into the backends (franzpoeschel, Jul 9, 2024)
- b6e7141 First attempt: Dataset-specific configuration (franzpoeschel, Jul 9, 2024)
- 310c91a Seems to work (franzpoeschel, Jul 9, 2024)
- 2feb7f5 Adapt Coretest to new output of myPath() (franzpoeschel, Jul 10, 2024)
- 016eb52 Better error messages and some documentation inside example (franzpoeschel, Jul 10, 2024)
- 7fbeaad Adapt constructors for installation without ADIOS2/HDF5 (franzpoeschel, Jul 10, 2024)
- a7c6c14 CI fixes (franzpoeschel, Jul 10, 2024)
- 36b74e7 Basic implementation (franzpoeschel, Jul 10, 2024)
- 8f7e9e3 Update documentation and tests (franzpoeschel, Jul 11, 2024)
- bab84df Add a JSON translation of the config for NVHPC compilers (franzpoeschel, Jul 16, 2024)
- 21c64b1 Use dataset-specific config in tests (franzpoeschel, Feb 10, 2025)
- 5a6d060 Fix: do_prune parameter for merge() (franzpoeschel, Feb 10, 2025)
- 7d2d6af Rename merge() -> merge_internal() (franzpoeschel, Feb 10, 2025)
- d6a09b2 Don't compute the matchers for all backends (franzpoeschel, Feb 10, 2025)
- 7534159 Add default block to test configs (franzpoeschel, Feb 10, 2025)
- 9019783 Documentation (franzpoeschel, Feb 10, 2025)
- b2ebe1d Add TOML example (franzpoeschel, Feb 10, 2025)
- 41de428 Add Python binding for openPMD_path (franzpoeschel, Feb 10, 2025)
- 9beff67 Fix doxygen (franzpoeschel, Feb 11, 2025)
- 1cfea7b Read dataset-specific configuration also in ADIOS2::openDataset (franzpoeschel, Feb 11, 2025)
- 9607c83 Cleanup (franzpoeschel, Feb 11, 2025)
- 48ec4e3 Fix initialization from Dummy IO Handler (franzpoeschel, Feb 11, 2025)
- 5e87f47 Fix Doxygen (franzpoeschel, Feb 12, 2025)
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -406,6 +406,7 @@ set(CORE_SOURCE
src/auxiliary/Date.cpp
src/auxiliary/Filesystem.cpp
src/auxiliary/JSON.cpp
src/auxiliary/JSONMatcher.cpp
src/auxiliary/Mpi.cpp
src/backend/Attributable.cpp
src/backend/BaseRecordComponent.cpp
73 changes: 73 additions & 0 deletions docs/source/details/backendconfig.rst
@@ -287,3 +287,76 @@ Explanation of the single keys:
In "template" mode, only the dataset metadata (type, extent and attributes) are stored and no chunks can be written or read (i.e. write/read operations will be skipped).
* ``json.attribute.mode`` / ``toml.attribute.mode``: One of ``"long"`` (default in openPMD 1.*) or ``"short"`` (default in openPMD 2.* and generally in TOML).
The long format explicitly encodes the attribute type in the dataset on disk, the short format only writes the actual attribute as a JSON/TOML value, requiring readers to recover the type.
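
As an illustration, the same attribute might be serialized roughly as follows in the two modes. This is a sketch only: the exact key names and layout shown here are assumptions for illustration, so consult files produced by the library for the actual on-disk format.

```json
{
  "long_mode_example": {
    "time": {
      "datatype": "DOUBLE",
      "value": 0.5
    }
  },
  "short_mode_example": {
    "time": 0.5
  }
}
```

In the short form, a reader must infer that ``0.5`` is a double from the JSON/TOML value itself.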

Dataset-specific configuration
------------------------------

Some configuration options are applicable on a per-dataset basis.
Most dataset-specific configuration options supported by the openPMD-api are additionally backend-specific, since they are format-specific serialization instructions such as compression or chunking.
Such dataset- and backend-specific configuration is hence specified under the key path ``<backend>.dataset``, e.g.:

.. code-block:: json

{
  "adios2": {
    "dataset": {
      "operators": []
    }
  },
  "hdf5": {
    "dataset": {
      "chunks": "auto"
    }
  }
}

Dataset-specific configuration options can be configured in multiple ways:

As part of the general JSON/TOML configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In the simplest case, the dataset configuration is specified without any extra steps as part of the JSON/TOML configuration passed to the ``Series`` constructor. This does not allow specifying different configurations per dataset, but sets the default configuration for all datasets.

As a separate JSON/TOML configuration during dataset initialization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similarly to the ``Series`` constructor, the ``Dataset`` constructor optionally receives a JSON/TOML configuration, used to set options only for those datasets initialized with this ``Dataset`` specification. Such options override the defaults given in the ``Series`` constructor.

This is the preferred way for configuring dataset-specific options that are *not* backend-specific (currently only ``{"resizable": true}``).

By pattern-matching the dataset names
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The above approach has the disadvantage that it must be supported explicitly by the downstream application, e.g. a simulation or a data reader. As an alternative, the backend-specific dataset configuration under ``<backend>.dataset`` can also be given as a list of alternatives that are matched against the dataset name in sequence, e.g. ``hdf5.dataset = [<pattern_1>, <pattern_2>, ...]``.

Each such pattern ``<pattern_i>`` is a JSON object with key ``cfg`` and optional key ``select``: ``{"select": <regex>, "cfg": <cfg>}``.

Here, ``<regex>`` is a regular expression or a list of regular expressions, using the egrep grammar as defined by the `C++ standard library <https://en.cppreference.com/w/cpp/regex/basic_regex/constants>`__.
``<cfg>`` is a configuration that will be forwarded as a "regular" dataset configuration to the backend.

.. note::

To match lists of regular expressions ``select = [REGEX_1, REGEX_2, ..., REGEX_n]``, the list is internally transformed into a single regular expression ``($^)|(REGEX_1)|(REGEX_2)|...|(REGEX_n)``.
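
The list-fusion transformation described in this note can be sketched in Python. This is a minimal illustration of the documented behavior, not the library's actual C++ implementation; ``fuse_regexes`` is a hypothetical helper name, and Python's ``re`` module stands in for the egrep grammar, which is equivalent for these simple patterns.

```python
import re

def fuse_regexes(patterns):
    # Fuse a list of regexes into "($^)|(R1)|(R2)|...".
    # The leading alternative "($^)" cannot match any non-empty string,
    # so an empty list yields a regex that matches no dataset path.
    return "(%s)" % ")|(".join(["$^"] + list(patterns))

fused = fuse_regexes([".*positionOffset.*", ".*particlePatches.*"])
# fullmatch mirrors the "full matches only" semantics described here
assert re.fullmatch(fused, "particles/e/positionOffset/x")
assert not re.fullmatch(fused, "meshes/E/x")
assert not re.fullmatch(fuse_regexes([]), "particles/e/position/x")
```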

In a configuration such as ``hdf5.dataset = [<pattern_1>, <pattern_2>, ...]``, the patterns are processed in a top-down manner, and the first matching pattern in the list is selected.
The specified regexes are matched against the openPMD dataset path either within the Iteration (e.g. ``meshes/E/x`` or ``particles/.*/position/.*``) or within the Series (e.g. ``/data/1/meshes/E/x`` or ``/data/.*/particles/.*/position/.*``), considering full matches only.

.. note::

The dataset name is determined by the result of ``attributable.myPath().openPMDPath()`` where ``attributable`` is an object in the openPMD hierarchy.

.. note::

To match against the path within the containing Iteration or within the containing Series, the specified regular expression is internally transformed into ``(/data/[0-9]+/)?(REGEX)`` where ``REGEX`` is the specified pattern, and then matched against the full dataset path.
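
This iteration-prefix transformation can likewise be sketched in Python. Again a sketch only, under the assumption that Python's ``re`` approximates the egrep grammar for these patterns; ``iteration_agnostic`` is a hypothetical helper name.

```python
import re

def iteration_agnostic(regex):
    # Let the pattern match either the path within the Iteration
    # or the full path including the "/data/<iteration>/" prefix.
    return "(/data/[0-9]+/)?(%s)" % regex

pattern = iteration_agnostic("meshes/E/x")
assert re.fullmatch(pattern, "meshes/E/x")
assert re.fullmatch(pattern, "/data/100/meshes/E/x")
# a non-numeric iteration segment does not match the optional prefix
assert not re.fullmatch(pattern, "/data/x/meshes/E/x")
```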

The **default configuration** is specified by omitting the ``select`` key.
Specifying more than one default is an error.
If no pattern matches a dataset, the default configuration is chosen if specified, or an empty JSON object ``{}`` otherwise.
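
Putting both transformations together, the top-down selection with default fallback can be sketched as follows. This is a minimal Python model of the documented matching semantics, not the library's actual C++ implementation; ``select_config`` and the ``hdf5_dataset`` sample are hypothetical names.

```python
import re

def select_config(patterns, dataset_path):
    """Return the cfg of the first matching pattern, top-down;
    fall back to the default entry (no "select" key), or {}."""
    default = {}
    for entry in patterns:
        if "select" not in entry:
            default = entry["cfg"]  # remember the default configuration
            continue
        selectors = entry["select"]
        if isinstance(selectors, str):
            selectors = [selectors]
        fused = "(%s)" % ")|(".join(["$^"] + selectors)
        full = "(/data/[0-9]+/)?(%s)" % fused
        if re.fullmatch(full, dataset_path):
            return entry["cfg"]
    return default

# Mirrors the hdf5.dataset list from the example configuration below.
hdf5_dataset = [
    {"cfg": {"chunks": "auto"}},
    {"select": ["/data/1/particles/e/.*"], "cfg": {"chunks": [5]}},
    {"select": "particles/e/.*", "cfg": {"chunks": [10]}},
]
assert select_config(hdf5_dataset, "/data/1/particles/e/position/x") == {"chunks": [5]}
assert select_config(hdf5_dataset, "/data/3/particles/e/position/x") == {"chunks": [10]}
assert select_config(hdf5_dataset, "/data/3/meshes/E/x") == {"chunks": "auto"}
```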

A full example:

.. literalinclude:: openpmd_extended_config.toml
:language: toml

.. literalinclude:: openpmd_extended_config.json
:language: json
62 changes: 62 additions & 0 deletions docs/source/details/openpmd_extended_config.json
@@ -0,0 +1,62 @@
{
  "adios2": {
    "engine": {
      "parameters": {
        "Profile": "On"
      }
    },
    "dataset": [
      {
        "cfg": {
          "operators": [
            {
              "type": "blosc",
              "parameters": {
                "clevel": "1",
                "doshuffle": "BLOSC_BITSHUFFLE"
              }
            }
          ]
        }
      },
      {
        "select": [
          ".*positionOffset.*",
          ".*particlePatches.*"
        ],
        "cfg": {
          "operators": []
        }
      }
    ]
  },
  "hdf5": {
    "independent_stores": false,
    "dataset": [
      {
        "cfg": {
          "chunks": "auto"
        }
      },
      {
        "select": [
          "/data/1/particles/e/.*",
          "/data/2/particles/e/.*"
        ],
        "cfg": {
          "chunks": [
            5
          ]
        }
      },
      {
        "select": "particles/e/.*",
        "cfg": {
          "chunks": [
            10
          ]
        }
      }
    ]
  }
}
44 changes: 44 additions & 0 deletions docs/source/details/openpmd_extended_config.toml
@@ -0,0 +1,44 @@

# ADIOS2 config

[adios2.engine.parameters]
Profile = "On"

# default configuration
[[adios2.dataset]]
# nested list as ADIOS2 can add multiple operators to a single dataset
[[adios2.dataset.cfg.operators]]
type = "blosc"
parameters.doshuffle = "BLOSC_BITSHUFFLE"
parameters.clevel = "1"

# dataset-specific configuration to exclude some datasets
# from applying operators.
[[adios2.dataset]]
select = [".*positionOffset.*", ".*particlePatches.*"]
cfg.operators = []

# Now HDF5

[hdf5]
independent_stores = false

# default configuration
# The position of the default configuration does not matter, but there must
# be only a single default configuration.
[[hdf5.dataset]]
cfg.chunks = "auto"

# Dataset-specific configuration that specifies full paths,
# i.e. including the path to the Iteration.
# The non-default configurations are matched in top-down order,
# so the order is relevant.
[[hdf5.dataset]]
select = ["/data/1/particles/e/.*", "/data/2/particles/e/.*"]
cfg.chunks = [5]

# dataset-specific configuration that specifies only the path
# within the Iteration
[[hdf5.dataset]]
select = "particles/e/.*"
cfg.chunks = [10]
111 changes: 100 additions & 11 deletions examples/13_write_dynamic_configuration.cpp
@@ -10,13 +10,15 @@ using namespace openPMD;

int main()
{
-    if (!getVariants()["adios2"])
+    if (!getVariants()["hdf5"])
{
// Example configuration below selects the HDF5 backend
return 0;
}

using position_t = double;

#if !__NVCOMPILER // see https://github.com/ToruNiina/toml11/issues/205
/*
* This example demonstrates how to use JSON/TOML-based dynamic
* configuration for openPMD.
@@ -34,7 +36,7 @@ int main()
# be passed by adding an at-sign `@` in front of the path
# The format will then be recognized by filename extension, i.e. .json or .toml

-backend = "adios2"
+backend = "hdf5"
iteration_encoding = "group_based"
# The following is only relevant in read mode
defer_iteration_parsing = true
Expand All @@ -57,13 +59,104 @@ parameters.clevel = 5
# type = "some other parameter"
# # ...

-[hdf5.dataset]
-chunks = "auto"
# Sometimes, dataset configurations should not affect all datasets, but only
# specific ones, e.g. only particle data.
# Dataset configurations can be given as a list, shown here using HDF5 as an example.
# In such lists, each entry is an object with two keys:
#
# 1. 'cfg': Mandatory key, this is the actual dataset configuration.
# 2. 'select': A Regex or a list of Regexes to match against the dataset name.
#
# This makes it possible to give dataset-specific configurations.
# The dataset name is the same as returned
# by `Attributable::myPath().openPMDPath()`.
# The regex must match against either the full path (e.g. "/data/1/meshes/E/x")
# or against the path within the iteration (e.g. "meshes/E/x").

# Example:
# Let HDF5 datasets be automatically chunked by default
[[hdf5.dataset]]
cfg.chunks = "auto"

# For particles, we can specify the chunking explicitly
[[hdf5.dataset]]
# Multiple selection regexes can be given as a list.
# They will be fused into a single regex '($^)|(regex1)|(regex2)|(regex3)|...'.
select = ["/data/1/particles/e/.*", "/data/2/particles/e/.*"]
cfg.chunks = [5]

# Selecting a match works top-down, the order of list entries is important.
[[hdf5.dataset]]
# Specifying only a single regex.
# The regex can match against the full dataset path
# or against the path within the Iteration.
# Capitalization is irrelevant.
select = "particles/e/.*"
CFG.CHUNKS = [10]
)END";
#else
/*
* This is the same configuration in JSON. We need this for NVHPC
* compilers due to problems that these compilers have with the
* toruniina::toml11 library.
*/
std::string const defaults = R"(
{
"backend": "hdf5",
"defer_iteration_parsing": true,
"iteration_encoding": "group_based",

"adios2": {
"engine": {
"type": "bp4"
},
"dataset": {
"operators": [
{
"parameters": {
"clevel": 5
},
"type": "zlib"
}
]
}
},

"hdf5": {
"dataset": [
{
"cfg": {
"chunks": "auto"
}
},
{
"select": [
"/data/1/particles/e/.*",
"/data/2/particles/e/.*"
],
"cfg": {
"chunks": [
5
]
}
},
{
"select": "particles/e/.*",
"CFG": {
"CHUNKS": [
10
]
}
}
]
}
}
)";
#endif

// open file for writing
Series series =
-        Series("../samples/dynamicConfig.bp", Access::CREATE, defaults);
+        Series("../samples/dynamicConfig.h5", Access::CREATE, defaults);

Datatype datatype = determineDatatype<position_t>();
constexpr unsigned long length = 10ul;
Expand Down Expand Up @@ -93,18 +186,14 @@ chunks = "auto"

/*
* We want different compression settings for this dataset, so we pass
-     * a dataset-specific configuration.
+     * a dataset-specific configuration. This will override any definition
+     * specified above.
     * Also showcase how to define a resizable dataset.
* This time in JSON.
*/
std::string const differentCompressionSettings = R"END(
{
"resizable": true,
-    "adios1": {
-        "dataset": {
-            "transform": "blosc:compressor=zlib,shuffle=bit,lvl=1;nometa"
-        }
-    },
"adios2": {
"dataset": {
"operators": [