Merge pull request #539 from zooniverse/feature/extract-acrsss-versions

Start using packaging.version for version parsing
zooniverse · Aug 10, 2022 · 73bd8f4 · 73bd8f4
2 parents 026d2de + fafae82
commit 73bd8f4
Show file tree

Hide file tree

Showing 12 changed files with 527 additions and 157 deletions.
diff --git a/README.md b/README.md
@@ -19,14 +19,14 @@ Instal the latest stable release:
 pip install panoptes_aggregation
 ```
 
-Or for development or testing, you can install the development version directly from GitHub:
+Upgrade an existing installation:
 ```bash
-pip install -U git+git://github.com/zooniverse/aggregation-for-caesar.git
+pip install -U panoptes_aggregation
 ```
 
-Upgrade and existing installation:
+Or for development or testing, you can install the latest development version directly from GitHub:
 ```bash
-pip install -U panoptes_aggregation
+pip install -U git+https://github.com/zooniverse/aggregation-for-caesar.git
 ```
 
 #### Install the Graphical User Interface (GUI)
@@ -35,6 +35,11 @@ If you would like to use the GUI instead of the command line install the package
 pip install "panoptes_aggregation[gui]"
 ```
 
+Or for the latest development build from GitHub:
+```bash
+pip install -U git+https://github.com/zooniverse/aggregation-for-caesar.git#egg=panoptes-aggregation[gui]
+```
+
 #### Anaconda build of python
 If your are using the anaconda version of python some of the dependencies should be installed using the `conda` package manager before installing `panoptes_aggregation`:
 ```bash

diff --git a/Scripts.md b/Scripts.md
@@ -41,7 +41,9 @@ Use the command line tool to make configuration `yaml` files that are used to se
 
 ```bash
 usage: panoptes_aggregation config [-h] [-d DIR] [-v VERSION]
-                                   [-m MINOR_VERSION] [-k KEYWORDS] [-vv]
+                                   [--min_version MIN_VERSION]
+                                   [--max_version MAX_VERSION] [-k KEYWORDS]
+                                   [-vv]
                                    workflow_csv workflow_id
 
 Make configuration files for panoptes data extraction and reduction based on a
@@ -60,10 +62,23 @@ Workflow ID and version numbers:
 
   workflow_id           the workflow ID you would like to extract
   -v VERSION, --version VERSION
-                        The major workflow version to extract
-  -m MINOR_VERSION, --minor_version MINOR_VERSION
-                        The minor workflow version used to create the lookup
-                        table for the workflow content
+                        The workflow version to extract. If only a major
+                        version is given (e.g. -v 3) all minor versions will
+                        be extracted at once. If a minor version is provided
+                        (e.g. -v 3.14) only that specific version will be
+                        extracted.
+  --min_version MIN_VERSION
+                        The minimum workflow version to extract (inclusive).
+                        This can be provided as either a major version (e.g.
+                        --min_version 3) or a major version with a minor
+                        version (e.g. --min_version 3.14). If this flag is
+                        provided the --version flag will be ignored.
+  --max_version MAX_VERSION
+                        The maximum workflow version to extract (inclusive).
+                        This can be provided as either a major version (e.g.
+                        --max_version 3) or a major version with a minor
+                        version (e.g. --max_version 3.14). If this flag is
+                        provided the --version flag will be ignored.
 
 Other keywords:
   Additional keywords to be passed into the configuration files
@@ -85,7 +100,7 @@ Other options:
 
 ### Example: Penguin Watch
 ```bash
-panoptes_aggregation config penguin-watch-workflows.csv 6465 -v 52 -m 76
+panoptes_aggregation config penguin-watch-workflows.csv 6465 -v 52.76
 ```
 
 This creates four files:
@@ -103,6 +118,7 @@ Use the command line tool to extract your data into one flat `csv` file for each
 
 ```bash
 usage: panoptes_aggregation extract [-h] [-d DIR] [-o OUTPUT] [-O]
+                                    [-c CPU_COUNT] [-vv]
                                     classification_csv extractor_config
 
 Extract data from panoptes classifications based on the workflow
@@ -125,6 +141,10 @@ What directory and base name should be used for the extractions:
 Other options:
   -O, --order           Arrange the data columns in alphabetical order before
                         saving
+  -c CPU_COUNT, --cpu_count CPU_COUNT
+                        How many cpu cores to use during extraction
+  -vv, --verbose        increase output verbosity
+
 ```
 
 ### Example: Penguin Watch
@@ -165,8 +185,8 @@ This creates two `csv` files (one for each extractor listed in the config file):
 Note: this only works for some task types, see the [documentation](https://aggregation-caesar.zooniverse.org/docs) for a full list of supported task types.
 
 ```bash
-usage: panoptes_aggregation reduce [-h] [-F {first,last,all}] [-O] [-d DIR]
-                                   [-o OUTPUT] [-s]
+usage: panoptes_aggregation reduce [-h] [-F {first,last,all}] [-O]
+                                   [-c CPU_COUNT] [-d DIR] [-o OUTPUT] [-s]
                                    extracted_csv reducer_config
 
 reduce data from panoptes classifications based on the extracted data
@@ -192,6 +212,8 @@ Reducer options:
                         for one subject
   -O, --order           Arrange the data columns in alphabetical order before
                         saving
+  -c CPU_COUNT, --cpu_count CPU_COUNT
+                        How many cpu cores to use during reduction
 ```
 
 ### Example: Penguin Watch

diff --git a/_static/gui_config.png b/_static/gui_config.png
diff --git a/_static/gui_extract.png b/_static/gui_extract.png
diff --git a/_static/gui_reducer.png b/_static/gui_reducer.png
diff --git a/panoptes_aggregation/scripts/aggregation_parser.py b/panoptes_aggregation/scripts/aggregation_parser.py
@@ -33,7 +33,7 @@ def main(args=None):
     )
     config_numbers = config_parser.add_argument_group(
         'Workflow ID and version numbers',
-        'Enter the workflow ID, major version number, and minor version number',
+        'Enter the workflow ID with a version number or version range',
         gooey_options={
             'show_border': False,
             'columns': 1
@@ -84,14 +84,18 @@ def main(args=None):
     config_numbers.add_argument(
         "-v",
         "--version",
-        help="The major workflow version to extract",
-        type=int
+        help="The workflow version to extract.  If only a major version is given (e.g. -v 3) all minor versions will be extracted at once.  If a minor version is provided (e.g. -v 3.14) only that specific version will be extracted.",
+        type=str
     )
     config_numbers.add_argument(
-        "-m",
-        "--minor_version",
-        help="The minor workflow version used to create the lookup table for the workflow content",
-        type=int
+        "--min_version",
+        help="The minimum workflow version to extract (inclusive).  This can be provided as either a major version (e.g. --min_version 3) or a major version with a minor version (e.g. --min_version 3.14).  If this flag is provided the --version flag will be ignored.",
+        type=str
+    )
+    config_numbers.add_argument(
+        "--max_version",
+        help="The maximum workflow version to extract (inclusive).  This can be provided as either a major version (e.g. --max_version 3) or a major version with a minor version (e.g. --max_version 3.14).  If this flag is provided the --version flag will be ignored.",
+        type=str
     )
     config_keywords.add_argument(
         "-k",
@@ -268,7 +272,8 @@ def main(args=None):
             args.workflow_csv,
             args.workflow_id,
             version=args.version,
-            minor_version=args.minor_version,
+            min_version=args.min_version,
+            max_version=args.max_version,
             keywords=args.keywords,
             output_dir=args.dir,
             verbose=args.verbose

diff --git a/panoptes_aggregation/scripts/config_workflow_panoptes.py b/panoptes_aggregation/scripts/config_workflow_panoptes.py
@@ -3,6 +3,8 @@
 import yaml
 import json
 import warnings
+import packaging.version
+import numpy as np
 
 warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
@@ -21,7 +23,8 @@ def config_workflow(
     workflow_csv,
     workflow_id,
     version=None,
-    minor_version=None,
+    min_version=None,
+    max_version=None,
     keywords={},
     output_dir=None,
     verbose=False
@@ -30,32 +33,57 @@ def config_workflow(
     with workflow_csv as workflow_csv_in:
         workflows = pandas.read_csv(workflow_csv_in, encoding='utf-8')
 
-    wdx = (workflows.workflow_id == workflow_id)
-    if version is None:
-        version = workflows[wdx].version.max()
-        if verbose:
-            warnings.warn('No major workflow version was specified, defaulting to version {0}'.format(version))
+    workflows['version_parse'] = np.array([
+        packaging.version.parse('{0}.{1}'.format(v, m))
+        for v, m in zip(workflows.version, workflows.minor_version)
+    ])
 
-    wdx &= (workflows.version == version)
-    if minor_version is None:
-        minor_version = workflows[wdx].minor_version.max()
+    wdx = (workflows.workflow_id == workflow_id)
+    if (version is None) and (min_version is None) and (max_version is None):
+        # no version specified, take the latest version of the workflow
+        version = workflows[wdx].version_parse.max()
+        workflow_version = str(version)
         if verbose:
-            warnings.warn('No minor workflow version was specified, defaulting to version {0}'.format(minor_version))
+            warnings.warn('No workflow version was specified, defaulting to version {0}'.format(version))
+        wdx &= (workflows.version_parse == version)
+    elif (version is None):
+        # either min or max version is given
+        workflow_version = {}
+        if min_version is not None:
+            workflow_version['min'] = min_version
+            min_version = packaging.version.parse(min_version)
+            wdx &= (workflows.version_parse >= min_version)
+        if max_version is not None:
+            workflow_version['max'] = max_version
+            max_version = packaging.version.parse(max_version)
+            wdx &= (workflows.version_parse <= max_version)
+    else:
+        # version is given
+        workflow_version = version
+        version = packaging.version.parse(version)
+        if version.minor == 0:
+            next_version = packaging.version.parse(str(version.major + 1))
+            wdx &= (workflows.version_parse >= version)
+            wdx &= (workflows.version_parse < next_version)
+        else:
+            wdx &= (workflows.version_parse == version)
 
-    wdx &= (workflows.minor_version == minor_version)
-    assert (wdx.sum() > 0), 'workflow ID and workflow version combination does not exist'
-    assert (wdx.sum() == 1), 'workflow ID and workflow version combination is not unique'
-    workflow = workflows[wdx].iloc[0]
+    assert (wdx.sum() > 0), 'workflow ID and workflow version(s) combination does not exist'
+    # configure off of the latest workflow when given a range
+    configure_version = workflows[wdx].version_parse.max()
+    configure_version_loc = np.argmax(workflows[wdx].version_parse)
+    if (wdx.sum() > 1) and verbose:
+        warnings.warn('A workflow range was specified, configuration is based on {0}'.format(configure_version))
+    workflow = workflows[wdx].iloc[configure_version_loc]
     workflow_tasks = json.loads(workflow.tasks)
     extractor_config = workflow_extractor_config(workflow_tasks, keywords=keywords)
-    workflow_version = '{0}.{1}'.format(version, minor_version)
     config = {
         'workflow_id': workflow_id,
         'workflow_version': workflow_version,
         'extractor_config': extractor_config
     }
     # configure the extractors
-    filename = 'Extractor_config_workflow_{0}_V{1}.yaml'.format(workflow_id, workflow_version)
+    filename = 'Extractor_config_workflow_{0}_V{1}.yaml'.format(workflow_id, configure_version)
     if output_dir is not None:
         filename = os.path.join(output_dir, filename)
     with open(filename, 'w', encoding='utf-8') as stream:
@@ -69,7 +97,7 @@ def config_workflow(
         reducer_config = {
             'reducer_config': reducer
         }
-        filename = 'Reducer_config_workflow_{0}_V{1}_{2}.yaml'.format(workflow_id, workflow_version, extractor)
+        filename = 'Reducer_config_workflow_{0}_V{1}_{2}.yaml'.format(workflow_id, configure_version, extractor)
         if output_dir is not None:
             filename = os.path.join(output_dir, filename)
         with open(filename, 'w', encoding='utf-8') as stream:
@@ -90,7 +118,7 @@ def config_workflow(
                 dropdown_label_hash = workflow_tasks[task_id][selects][int(selects_idx)][options][star][int(star_idx)]['value']
                 dropdown_label = strings_extract[dropdown_string_key]
                 strings_extract[dropdown_string_key] = {dropdown_label_hash: dropdown_label}
-    filename = 'Task_labels_workflow_{0}_V{1}.yaml'.format(workflow_id, workflow_version)
+    filename = 'Task_labels_workflow_{0}_V{1}.yaml'.format(workflow_id, configure_version)
     if output_dir is not None:
         filename = os.path.join(output_dir, filename)
     with open(filename, 'w', encoding='utf-8') as stream:

diff --git a/panoptes_aggregation/scripts/extract_panoptes_csv.py b/panoptes_aggregation/scripts/extract_panoptes_csv.py
@@ -1,5 +1,7 @@
 from collections import OrderedDict, defaultdict
 from multiprocessing import Pool
+import numpy as np
+import packaging.version
 import copy
 import json
 import io
@@ -24,10 +26,6 @@ def get_file_instance(file):
     return file
 
 
-def get_major_version(s):
-    return s.split('.')[0]
-
-
 def extract_classification(
     classification_by_task,
     classification_info,
@@ -93,8 +91,26 @@ def extract_csv(
 
     extractor_config = config_yaml['extractor_config']
     workflow_id = config_yaml['workflow_id']
-    version = config_yaml['workflow_version']
-    number_of_extractors = sum([len(value) for key, value in extractor_config.items()])
+    if isinstance(config_yaml['workflow_version'], dict):
+        # a version range was given
+        version_range = config_yaml['workflow_version']
+        for key, value in version_range.items():
+            version_range[key] = packaging.version.parse(value)
+    else:
+        # a single version is given
+        version = packaging.version.parse(config_yaml['workflow_version'])
+        if version.minor == 0:
+            # only a major version given, take all rows with the same major version
+            # note, the max is inclusive, but there are no workflows with a minor
+            # version of 0, so that is OK here
+            next_version = packaging.version.parse(str(version.major + 1))
+        else:
+            next_version = version
+        version_range = {
+            'min': version,
+            'max': next_version
+        }
+    number_of_extractors = sum([len(value) for _, value in extractor_config.items()])
 
     extracted_data = defaultdict(list)
 
@@ -104,13 +120,16 @@ def extract_csv(
 
     wdx = classifications.workflow_id == workflow_id
     assert (wdx.sum() > 0), 'There are no classifications matching the configured workflow ID'
-    if '.' in version:
-        vdx = classifications.workflow_version == version
-    else:
-        vdx = classifications.workflow_version.apply(get_major_version) == version
 
-    assert (vdx.sum() > 0), 'There are no classificaitons matching the configured version number'
-    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number'
+    classifications.workflow_version = classifications.workflow_version.apply(packaging.version.parse)
+    vdx = np.ones_like(classifications.workflow_version, dtype=bool)
+    if 'min' in version_range:
+        vdx &= classifications.workflow_version >= version_range['min']
+    if 'max' in version_range:
+        vdx &= classifications.workflow_version <= version_range['max']
+
+    assert (vdx.sum() > 0), 'There are no classifications matching the configured version number(s)'
+    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number(s)'
 
     widgets = [
         'Extracting: ',

diff --git a/panoptes_aggregation/tests/scripts_tests/test_aggregation_parser.py b/panoptes_aggregation/tests/scripts_tests/test_aggregation_parser.py
@@ -15,8 +15,9 @@ def test_config_called(self, mock_config_workflow, mock_FileType):
             mock_FileType.return_value.return_value,
             123,
             keywords={},
-            minor_version=None,
             version=None,
+            min_version=None,
+            max_version=None,
             output_dir=os.getcwd(),
             verbose=False
         )