Add --order-by support to query-datasets command-line
Do not include it in butler associate.
This required a small rewrite of the table accumulator to use
a dict rather than a set.
timj committed Sep 4, 2024
1 parent c324dac · commit afba401
Showing 8 changed files with 71 additions and 31 deletions.
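
The switch from a set to a dict in the table accumulator works because Python dicts preserve insertion order (guaranteed since Python 3.7) while still collapsing duplicate keys, so query results keep whatever order the backend returned them in, which is what --order-by relies on. A minimal standalone sketch of the idea, with plain strings standing in for DatasetRef:

```python
# A dict used as an ordered set: duplicate keys collapse, and first-seen
# insertion order survives (a set dedups too, but offers no ordering
# guarantee).
refs: dict[str, str | None] = {}
for ref, uri in [("ds-2", None), ("ds-1", "file:///tmp/a"), ("ds-2", None)]:
    refs[ref] = uri  # re-adding "ds-2" dedups without moving it

print(list(refs))  # ['ds-2', 'ds-1']: duplicates gone, order kept
```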

python/lsst/daf/butler/cli/cmd/commands.py (1 addition, 1 deletion)

@@ -78,7 +78,7 @@
 @click.command(cls=ButlerCommand, short_help="Add existing datasets to a tagged collection.")
 @repo_argument(required=True)
 @collection_argument(help="COLLECTION is the collection the datasets should be associated with.")
-@query_datasets_options(repo=False, showUri=False, useArguments=False, default_limit=0)
+@query_datasets_options(repo=False, showUri=False, useArguments=False, default_limit=0, use_order_by=False)
 @options_file_option()
 def associate(**kwargs: Any) -> None:
     """Add existing datasets to a tagged collection; searches for datasets with

python/lsst/daf/butler/cli/opt/optionGroups.py (11 additions, 2 deletions)

@@ -33,7 +33,7 @@

 from ..utils import OptionGroup, unwrap, where_help
 from .arguments import glob_argument, repo_argument
-from .options import collections_option, dataset_type_option, limit_option, where_option
+from .options import collections_option, dataset_type_option, limit_option, order_by_option, where_option


 class query_datasets_options(OptionGroup):  # noqa: N801
@@ -49,10 +49,17 @@ class query_datasets_options(OptionGroup):  # noqa: N801
         Whether this is an argument or an option.
     default_limit : `int`
         The default value to use for the limit parameter.
+    use_order_by : `bool`
+        Whether to include an order_by option.
     """

     def __init__(
-        self, repo: bool = True, showUri: bool = True, useArguments: bool = True, default_limit: int = -10_000
+        self,
+        repo: bool = True,
+        showUri: bool = True,
+        useArguments: bool = True,
+        default_limit: int = -10_000,
+        use_order_by: bool = True,
     ) -> None:
         self.decorators = []
         if repo:
@@ -100,6 +107,8 @@ def __init__(
                 ),
             ]
         )
+        if use_order_by:
+            self.decorators.append(order_by_option())
         if showUri:
             self.decorators.append(
                 click.option("--show-uri", is_flag=True, help="Show the dataset URI in results.")

python/lsst/daf/butler/script/_associate.py (1 addition, 0 deletions)

@@ -74,6 +74,7 @@ def associate(
         where=where,
         find_first=find_first,
         limit=limit,
+        order_by=(),
         show_uri=False,
         repo=None,
     )

python/lsst/daf/butler/script/queryDatasets.py (36 additions, 26 deletions)

@@ -26,7 +26,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 from __future__ import annotations

-import dataclasses
 import logging
 from collections import defaultdict
 from collections.abc import Iterable, Iterator
@@ -46,21 +45,15 @@
 _LOG = logging.getLogger(__name__)


-@dataclasses.dataclass(frozen=True)
-class _RefInfo:
-    datasetRef: DatasetRef
-    uri: str | None
-
-
 class _Table:
     """Aggregates rows for a single dataset type, and creates an astropy table
     with the aggregated data. Eliminates duplicate rows.
     """

-    datasetRefs: set[_RefInfo]
+    datasetRefs: dict[DatasetRef, str | None]

     def __init__(self) -> None:
-        self.datasetRefs = set()
+        self.datasetRefs = {}

     def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
         """Add a row of information to the table.
@@ -76,15 +69,18 @@ def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
             The URI to show as a file location in the table, by default `None`.
         """
         uri_str = str(uri) if uri else None
-        self.datasetRefs.add(_RefInfo(datasetRef, uri_str))
+        # Use a dict to retain ordering.
+        self.datasetRefs[datasetRef] = uri_str

-    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
+    def getAstropyTable(self, datasetTypeName: str, sort: bool = True) -> AstropyTable:
         """Get the table as an astropy table.

         Parameters
         ----------
         datasetTypeName : `str`
             The dataset type name to show in the ``type`` column of the table.
+        sort : `bool`, optional
+            If `True` the table will be sorted.

         Returns
         -------
@@ -96,11 +92,8 @@ def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
         if not self.datasetRefs:
             raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

-        refInfo = next(iter(self.datasetRefs))
-        dimensions = [
-            refInfo.datasetRef.dataId.universe.dimensions[k]
-            for k in refInfo.datasetRef.dataId.dimensions.data_coordinate_keys
-        ]
+        ref = next(iter(self.datasetRefs))
+        dimensions = [ref.dataId.universe.dimensions[k] for k in ref.dataId.dimensions.data_coordinate_keys]
         columnNames = ["type", "run", "id", *[str(item) for item in dimensions]]

         # Need to hint the column types for numbers since the per-row
@@ -111,26 +104,29 @@ def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
             None,
             None,
             str,
-            *[typeMap.get(type(value)) for value in refInfo.datasetRef.dataId.full_values],
+            *[typeMap.get(type(value)) for value in ref.dataId.full_values],
         ]
-        if refInfo.uri:
+        if self.datasetRefs[ref]:
             columnNames.append("URI")
             columnTypes.append(None)

         rows = []
-        for refInfo in self.datasetRefs:
+        for ref, uri in self.datasetRefs.items():
             row = [
                 datasetTypeName,
-                refInfo.datasetRef.run,
-                str(refInfo.datasetRef.id),
-                *refInfo.datasetRef.dataId.full_values,
+                ref.run,
+                str(ref.id),
+                *ref.dataId.full_values,
             ]
-            if refInfo.uri:
-                row.append(refInfo.uri)
+            if uri:
+                row.append(uri)
             rows.append(row)

         dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
-        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])
+        if sort:
+            return sortAstropyTable(dataset_table, dimensions, ["type", "run"])
+        else:
+            return dataset_table


 class QueryDatasets:
@@ -160,6 +156,11 @@ class QueryDatasets:
         Limit the number of results to be returned. A value of 0 means
         unlimited. A negative value is used to specify a cap where a warning
         is issued if that cap is hit.
+    order_by : `tuple` of `str`
+        Dimensions to use for sorting results. If no ordering is given the
+        results of ``limit`` are undefined and default sorting of the resulting
+        datasets will be applied. It is an error if the requested ordering
+        is inconsistent with the dimensions of the dataset type being queried.
     repo : `str` or `None`
         URI to the location of the repo or URI to a config file describing the
         repo and its location. One of `repo` and `butler` must be `None` and
@@ -177,6 +178,7 @@ def __init__(
         find_first: bool,
         show_uri: bool,
         limit: int = 0,
+        order_by: tuple[str, ...] = (),
         repo: str | None = None,
         butler: Butler | None = None,
     ):
@@ -191,6 +193,7 @@
         self._where = where
         self._find_first = find_first
         self._limit = limit
+        self._order_by = order_by

     def getTables(self) -> list[AstropyTable]:
         """Get the datasets as a list of astropy tables.
@@ -212,7 +215,12 @@ def getTables(self) -> list[AstropyTable]:
                 for name, uri in uris.componentURIs.items():
                     tables[ref.datasetType.componentTypeName(name)].add(ref, uri)

-        return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]
+        # Sort if we haven't been told to enforce an order.
+        sort_table = not bool(self._order_by)
+        return [
+            table.getAstropyTable(datasetTypeName, sort=sort_table)
+            for datasetTypeName, table in tables.items()
+        ]

     # @profile
     def getDatasets(self) -> Iterator[DatasetRef]:
@@ -247,6 +255,7 @@ def getDatasets(self) -> Iterator[DatasetRef]:
             _LOG.info("The given dataset type, %s, is not known to this butler.", datasetTypes)
         else:
             _LOG.info("Processing %d dataset type%s", n_dataset_types, "" if n_dataset_types == 1 else "s")
+        _LOG.warning("Order by: %s", self._order_by)

         # Accumulate over dataset types.
         limit = self._limit
@@ -267,6 +276,7 @@
                     collections=query_collections,
                     find_first=self._find_first,
                     with_dimension_records=True,
+                    order_by=self._order_by,
                     **kwargs,
                 )
                 if not unlimited:
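
With the plumbing above in place, QueryDatasets can be driven with an explicit ordering from Python as well as from the command line. A hedged usage sketch; the dataset type glob, collection name, dimension names, and repo path below are placeholders for illustration, not values from this commit:

```python
from lsst.daf.butler.script.queryDatasets import QueryDatasets

query = QueryDatasets(
    ("raw",),                 # dataset type glob (placeholder)
    ("HSC/raw/all",),         # collections to search (placeholder)
    where="",
    find_first=False,
    show_uri=False,
    limit=-10_000,            # negative limit: soft cap that warns when hit
    order_by=("visit", "detector"),  # () falls back to the default table sort
    repo="/path/to/repo",     # placeholder path
)
tables = query.getTables()    # ordering comes from the query when order_by is set
```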

python/lsst/daf/butler/script/retrieveArtifacts.py (7 additions, 0 deletions)

@@ -49,6 +49,7 @@ def retrieveArtifacts(
     where: str,
     find_first: bool,
     limit: int,
+    order_by: tuple[str, ...],
     transfer: str,
     preserve_path: bool,
     clobber: bool,
@@ -75,6 +76,11 @@
         Limit the number of results to be returned. A value of 0 means
         unlimited. A negative value is used to specify a cap where a warning
         is issued if that cap is hit.
+    order_by : `tuple` of `str`
+        Dimensions to use for sorting results. If no ordering is given the
+        results of ``limit`` are undefined and default sorting of the resulting
+        datasets will be applied. It is an error if the requested ordering
+        is inconsistent with the dimensions of the dataset type being queried.
     transfer : `str`
         Transfer mode to use when placing artifacts in the destination.
     preserve_path : `bool`
@@ -102,6 +108,7 @@
         where=where,
         find_first=find_first,
         limit=limit,
+        order_by=order_by,
         show_uri=False,
     )
     refs = list(query.getDatasets())

python/lsst/daf/butler/script/transferDatasets.py (7 additions, 0 deletions)

@@ -44,6 +44,7 @@ def transferDatasets(
     where: str,
     find_first: bool,
     limit: int,
+    order_by: tuple[str, ...],
     transfer: str,
     register_dataset_types: bool,
     transfer_dimensions: bool = True,
@@ -69,6 +70,11 @@
         Limit the number of results to be returned. A value of 0 means
         unlimited. A negative value is used to specify a cap where a warning
         is issued if that cap is hit.
+    order_by : `tuple` of `str`
+        Dimensions to use for sorting results. If no ordering is given the
+        results of ``limit`` are undefined and default sorting of the resulting
+        datasets will be applied. It is an error if the requested ordering
+        is inconsistent with the dimensions of the dataset type being queried.
     transfer : `str`
         Transfer mode to use when placing artifacts in the destination.
     register_dataset_types : `bool`
@@ -91,6 +97,7 @@
         where=where,
         find_first=find_first,
         limit=limit,
+        order_by=order_by,
         show_uri=False,
     )
     # Place results in a set to remove duplicates (which should not exist

tests/test_cliCmdAssociate.py (4 additions, 0 deletions)

@@ -58,6 +58,7 @@ def test_defaults(self, mockAssociate):
             collections=(),
             where="",
             find_first=False,
+            limit=0,
         )

     @patch("lsst.daf.butler.script.associate")
@@ -76,6 +77,8 @@ def test_values(self, mockAssociate):
                 "--where",
                 "'a=b'",
                 "--find-first",
+                "--limit",
+                "-5000",
             ],
         )
         self.assertEqual(result.exit_code, 0, clickResultMsg(result))
@@ -86,6 +89,7 @@
             collections=("myCollection", "otherCollection"),
             where="'a=b'",
             find_first=True,
+            limit=-5000,
         )


tests/test_cliCmdQueryDatasets.py (4 additions, 2 deletions)

@@ -147,8 +147,10 @@ class QueryDatasetsTest(unittest.TestCase, ButlerTestHelper):
     storageClassFactory = StorageClassFactory()

     @staticmethod
-    def _queryDatasets(repo, glob=(), collections=(), where="", find_first=False, show_uri=False):
-        return script.QueryDatasets(glob, collections, where, find_first, show_uri, repo=repo).getTables()
+    def _queryDatasets(repo, glob=(), collections=(), where="", find_first=False, show_uri=False, limit=0):
+        return script.QueryDatasets(
+            glob, collections, where=where, find_first=find_first, show_uri=show_uri, limit=limit, repo=repo
+        ).getTables()

     def setUp(self):
         self.testdir = makeTestTempDir(TESTDIR)
