ENH: Add module with extra dependencies to create a generic xarray dataset from a packet data file

medley56 · medley56 · commit d651445a3149 · 2025-01-29T19:37:21.000-07:00
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
@@ -46,7 +46,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install . pytest pytest-randomly
+          pip install ".[xarray]" pytest pytest-randomly
       
       - name: Testing
         run: |
diff --git a/docs/source/changelog.md b/docs/source/changelog.md
@@ -25,6 +25,7 @@ Release notes for the `space_packet_parser` library
     ``definition.packet_generator(data, combine_segmented_packets=True, secondary_header_bytes=4)``
 - Add a command line interface (spp) to enable quick and easy access to
   some common tasks and utilities.
+- Add function to directly create `xarray.Dataset` objects (one per APID) from packet files and an XTCE definition.
 
 ### v5.0.1 (released)
 - BUGFIX: Allow raw_value representation for enums with falsy raw values. Previously these defaulted to the enum label.
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,9 @@ python = ">=3.9"
 lxml = ">=4.8.0"
 click = "^8.0"
 rich = "^13.0"
+# Extras dependencies
+xarray = { version = ">2024.0.0", optional = true }
+numpy = { version = "^2.0.0", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 pycodestyle = "*"
@@ -60,6 +63,9 @@ optional = true
 matplotlib = ">=3.4"
 memory-profiler = "^0.61.0"
 
+[tool.poetry.extras]
+xarray = ["xarray", "numpy"]
+
 [tool.poetry.scripts]
 spp = "space_packet_parser.cli:spp"
 
diff --git a/space_packet_parser/extras/__init__.py b/space_packet_parser/extras/__init__.py
diff --git a/space_packet_parser/extras/xarr.py b/space_packet_parser/extras/xarr.py
@@ -0,0 +1,196 @@
+"""Extras module that supports generating `xarray.Dataset` objects directly from packet files"""
+# Extras import first since it might fail
+try:
+    import xarray as xr
+    import numpy as np
+except ImportError as ie:
+    raise ImportError(
+        "Failed to import dependencies for xarray extra. Did you install the [xarray] extras package?"
+    ) from ie
+# Standard
+import collections
+from typing import Optional, Union, Iterable
+from pathlib import Path
+# Local
+from space_packet_parser import definitions, parameters, encodings
+
+
+def _get_minimum_numpy_datatype(  # noqa: PLR0912 - Too many branches pylint: disable=too-many-branches
+        name: str,
+        definition: definitions.XtcePacketDefinition,
+        use_raw_value: bool = False
+) -> Optional[str]:
+    """
+    Get the minimum numpy datatype for a given parameter.
+
+    Parameters
+    ----------
+    name : str
+        The parameter name, looked up in `definition.named_parameters`.
+    definition : definitions.XtcePacketDefinition
+        The XTCE packet definition. Used to examine data types to infer their minimal numpy representation.
+    use_raw_value : bool
+        Default False. If True, uses the data type of the raw value for each parameter.
+
+    Returns
+    -------
+    datatype : Optional[str]
+        The minimum numpy dtype for the parameter.
+        Returns None to indicate that numpy should use default dtype inference.
+    """
+    data_encoding = definition.named_parameters[name].parameter_type.encoding
+
+    datatype = None
+
+    if use_raw_value and isinstance(
+        definition.named_parameters[name].parameter_type,
+        parameters.EnumeratedParameterType,
+    ):
+        # We don't have a way of knowing what is enumerated,
+        # let numpy infer the datatype
+        return None
+
+    if isinstance(data_encoding, encodings.NumericDataEncoding):
+        if not use_raw_value and (
+            data_encoding.context_calibrators is not None
+            or data_encoding.default_calibrator is not None
+        ):
+            # If there are calibrators, we need to default to None and
+            # let numpy infer the datatype
+            return None
+
+        nbits = data_encoding.size_in_bits
+        if isinstance(data_encoding, encodings.IntegerDataEncoding):
+            datatype = "int"
+            if data_encoding.encoding == "unsigned":
+                datatype = "uint"
+            if nbits <= 8:
+                datatype += "8"
+            elif nbits <= 16:
+                datatype += "16"
+            elif nbits <= 32:
+                datatype += "32"
+            else:
+                datatype += "64"
+        elif isinstance(data_encoding, encodings.FloatDataEncoding):
+            datatype = "float"
+            if nbits == 32:
+                datatype += "32"
+            else:
+                datatype += "64"
+    elif isinstance(data_encoding, encodings.BinaryDataEncoding):
+        # TODO: Binary string representation right now, do we want bytes or
+        #  something else like the new StringDType instead?
+        datatype = "str"
+    elif isinstance(data_encoding, encodings.StringDataEncoding):
+        # TODO: Use the new StringDType instead?
+        datatype = "str"
+    else:
+        raise ValueError(f"Unsupported data encoding: {data_encoding}")
+
+    return datatype
+
+
+def create_dataset(
+        packet_files: Union[str, Path, Iterable[Union[str, Path]]],
+        xtce_packet_definition: Union[str, Path, definitions.XtcePacketDefinition],
+        use_raw_values: bool = False,
+        packet_generator_kwargs: Optional[dict] = None
+):
+    """Create one xarray dataset per APID from one or more packet files
+
+    # TODO: Filter by APID to handle muxed streams?
+
+    Notes
+    -----
+    This function only handles packet definitions with the same variable structure
+    across all packets with the same ApId. For example, this cannot be used for polymorphic
+    packets whose structure changes based on previously parsed values.
+
+    Parameters
+    ----------
+    packet_files : Union[str, Path, Iterable[Union[str, Path]]]
+        Path, or iterable of paths, to the binary packet file(s) to parse
+    xtce_packet_definition : Union[str, Path, XtcePacketDefinition]
+        Packet definition for parsing the packet data; a str or Path is loaded as an XtcePacketDefinition
+    use_raw_values: bool
+        Default False. If True, saves parameter raw values to the resulting Dataset(s).
+        e.g. enumerated lookups will be saved as their encoded integer values.
+    packet_generator_kwargs : Optional[dict]
+        Keyword arguments passed to `XtcePacketDefinition.packet_generator()`
+
+    Returns
+    -------
+    : dict
+        Mapping of APID to the `xarray.Dataset` built from all packets with that APID.
+    """
+    packet_generator_kwargs = packet_generator_kwargs or {}
+
+    if not isinstance(xtce_packet_definition, definitions.XtcePacketDefinition):
+        xtce_packet_definition = definitions.XtcePacketDefinition(xtce_packet_definition)
+
+    if isinstance(packet_files, (str, Path)):
+        packet_files = [packet_files]
+
+    # Set up containers to store our data
+    # We are getting a packet file that may contain multiple apids
+    # Each apid has consistent data fields, so we want to create a
+    # dataset per apid.
+    # {apid1: dataset1, apid2: dataset2, ...}
+    data_dict: dict[int, dict] = {}
+    # Also keep track of the datatype mapping for each field
+    datatype_mapping: dict[int, dict] = {}
+    # Keep track of which variables (keys) are in the dataset
+    variable_mapping: dict[int, set] = {}
+
+    for packet_file in packet_files:
+        with open(packet_file, "rb") as f:
+            packet_generator = list(xtce_packet_definition.packet_generator(f, **packet_generator_kwargs))
+
+        for packet in packet_generator:
+            apid = list(packet.values())[3]  # Positional lookup (assumes the 4th parsed field is the APID) so header fields may be named freely
+            if apid not in data_dict:
+                # This is the first packet for this APID
+                data_dict[apid] = collections.defaultdict(list)
+                datatype_mapping[apid] = {}
+                variable_mapping[apid] = packet.keys()
+
+            if variable_mapping[apid] != packet.keys():
+                raise ValueError(
+                    f"Packet fields do not match for APID {apid}. This could be "
+                    f"due to a conditional (polymorphic) packet definition in the XTCE, while this "
+                    f"function currently only supports flat packet definitions."
+                    f"\nExpected: {variable_mapping[apid]},\ngot: {list(packet.keys())}"
+                )
+
+            # TODO: Do we want to give an option to remove the header content?
+            #  Headers are generally useful, so what is the use case?
+
+            for key, value in packet.items():
+                if use_raw_values:
+                    # Store the raw (encoded) value instead of the derived value
+                    val = value.raw_value
+                else:
+                    val = value
+
+                data_dict[apid][key].append(val)
+                if key not in datatype_mapping[apid]:
+                    # Add this datatype to the mapping
+                    datatype_mapping[apid][key] = _get_minimum_numpy_datatype(
+                        key, xtce_packet_definition, use_raw_value=use_raw_values
+                    )
+
+    # Turn the dict into an xarray dataset
+    dataset_by_apid = {}
+
+    for apid, data in data_dict.items():
+        ds = xr.Dataset(
+            data_vars={
+                key: (["packet"], np.asarray(list_of_values, dtype=datatype_mapping[apid][key]))
+                for key, list_of_values in data.items()
+            }
+        )
+
+        dataset_by_apid[apid] = ds
+
+    return dataset_by_apid
diff --git a/space_packet_parser/packets.py b/space_packet_parser/packets.py
@@ -217,8 +217,8 @@ def create_ccsds_packet(data=b"\x00",
                         secondary_header_flag=0,
                         apid=2047,  # 2047 is defined as a fill packet in the CCSDS spec
                         sequence_flags=SequenceFlags.UNSEGMENTED,
-                        sequence_count=0):
-    """Create a binary CCSDS packet.
+                        sequence_count=0) -> RawPacketData:
+    """Create a binary CCSDS packet from input values.
 
     Pack the header fields into the proper bit locations and append the data bytes.
 
@@ -238,6 +238,11 @@ def create_ccsds_packet(data=b"\x00",
         CCSDS Packet Sequence Flags (2 bits)
     sequence_count : int
         CCSDS Packet Sequence Count (14 bits)
+
+    Returns
+    -------
+    : RawPacketData
+        Resulting binary packet
     """
     if version_number < 0 or version_number > 7:  # 3 bits
         raise ValueError("version_number must be between 0 and 7")
diff --git a/tests/integration/test_xarr.py b/tests/integration/test_xarr.py
@@ -0,0 +1,22 @@
+"""Test creating an xarray dataset from CCSDS packets"""
+from space_packet_parser.extras.xarr import create_dataset
+
+
+def test_create_xarray_dataset(jpss_test_data_dir):
+    """Test creating a per-APID xarray dataset from a JPSS geolocation packet file"""
+    packet_file = jpss_test_data_dir / "J01_G011_LZ_2021-04-09T00-00-00Z_V01.DAT1"
+    definition_file = jpss_test_data_dir / "jpss1_geolocation_xtce_v1.xml"
+    ds = create_dataset(packet_file, definition_file)
+    assert list(ds.keys()) == [11]
+    assert len(ds[11]) == 27
+    assert len(ds[11]["VERSION"]) == 7200
+
+
+def test_create_xarray_dataset_multiple_files(jpss_test_data_dir):
+    """Test that packets parsed from multiple files are combined into one dataset per APID"""
+    packet_file = jpss_test_data_dir / "J01_G011_LZ_2021-04-09T00-00-00Z_V01.DAT1"
+    definition_file = jpss_test_data_dir / "jpss1_geolocation_xtce_v1.xml"
+    ds = create_dataset([packet_file, packet_file], definition_file)
+    assert list(ds.keys()) == [11]
+    assert len(ds[11]) == 27
+    assert len(ds[11]["VERSION"]) == 14400