Commit d7a13fb

Author: Maxim Zhiltsov
Merge branch 'develop' into zm/yolo-custom-subset-name
2 parents 52c7112 + be7bbcc commit d7a13fb


61 files changed (+1589, -284 lines)

.github/workflows/github_pages.yml

Lines changed: 8 additions & 1 deletion
@@ -30,10 +30,17 @@ jobs:
       run: |
         npm ci
 
+    # The pip upgrade must be in a separate step, because otherwise bash will
+    # remember where the system-installed pip was, and will use it in any following
+    # commands instead of the newly-installed pip.
+    - name: Upgrade pip
+      run: |
+        pip install --upgrade pip
+
     - name: Build docs
       run: |
         pip install gitpython packaging toml Sphinx==4.2.0 sphinx-rtd-theme==1.0.0 sphinx-copybutton==0.4.0 \
-          tensorflow openvino-dev[accuracy_check] sphinxcontrib-mermaid
+          tensorflow openvino-dev sphinxcontrib-mermaid
         pip install -r requirements.txt
         pip install git+https://github.com/pytorch-ignite/sphinxcontrib-versioning.git@a1a1a94ca80a0233f0df3eaf9876812484901e76
         sphinx-versioning -l site/source/conf.py build -r develop -w develop site/source site/static/api

.github/workflows/linter.yml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ jobs:
       run: |
         pip install --user -r <(grep "^pylint" ./requirements.txt)
         echo "Pylint version: "`pylint --version | head -1`
-        git ls-files -z '*.py' | xargs -0 pylint -j 0 -r n
+        git ls-files -z '*.py' | xargs -0 pylint
   remark:
     runs-on: ubuntu-latest
     steps:

.pylintrc

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@ persistent=yes
 load-plugins=
 
 # Use multiple processes to speed up Pylint.
-jobs=1
+jobs=0
 
 # Allow loading of arbitrary C extensions. Extensions are imported into the
 # active Python interpreter and may run arbitrary code.
@@ -173,7 +173,7 @@ enable=
 output-format=text
 
 # Tells whether to display a full report or only the messages
-reports=yes
+reports=no
 
 # Python expression which should return a note less than 10 (10 is the highest
 # note). You have access to the variables errors warning, statement which
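Setting `jobs=0` tells Pylint to auto-detect a worker count (roughly one process per available CPU), which is why the explicit `-j 0` flag became redundant in the linter workflow change above. A rough illustration of that resolution rule — `effective_jobs` is a hypothetical helper for exposition, not Pylint's actual implementation:

```python
import multiprocessing


def effective_jobs(jobs: int) -> int:
    # jobs=0 means "auto-detect": use one worker per available CPU.
    # Any positive value is taken literally.
    if jobs == 0:
        return multiprocessing.cpu_count()
    return jobs
```

With `jobs=0` in `.pylintrc`, the same parallelism applies to every Pylint invocation, not just the one in CI.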

CHANGELOG.md

Lines changed: 23 additions & 1 deletion
@@ -15,8 +15,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/539>)
 - BraTS format (import-only) (.npy and .nii.gz), new `MultiframeImage`
   media type (<https://github.com/openvinotoolkit/datumaro/pull/628>)
+- Common Semantic Segmentation dataset format (import-only)
+  (<https://github.com/openvinotoolkit/datumaro/pull/685>)
+- An option to disable `data/` prefix inclusion in YOLO export
+  (<https://github.com/openvinotoolkit/datumaro/pull/689>)
+- New command `describe-downloads` to print information about downloadable datasets
+  (<https://github.com/openvinotoolkit/datumaro/pull/678>)
+- Detection for Cityscapes format
+  (<https://github.com/openvinotoolkit/datumaro/pull/680>)
+- Maximum recursion `--depth` parameter for `detect-dataset` CLI command
+  (<https://github.com/openvinotoolkit/datumaro/pull/680>)
+- An option to save a single subset in the `download` command
+  (<https://github.com/openvinotoolkit/datumaro/pull/697>)
 
 ### Changed
+- `env.detect_dataset()` now returns a list of detected formats at all recursion levels
+  instead of just the lowest one
+  (<https://github.com/openvinotoolkit/datumaro/pull/680>)
+- Open Images: allowed to store annotations file in root path as well
+  (<https://github.com/openvinotoolkit/datumaro/pull/680>)
+- Improved parsing error messages in COCO, VOC and YOLO formats
+  (<https://github.com/openvinotoolkit/datumaro/pull/684>,
+  <https://github.com/openvinotoolkit/datumaro/pull/686>,
+  <https://github.com/openvinotoolkit/datumaro/pull/687>)
 - YOLO format now supports almost any subset names, except of
   just `train` and `valid`
   (<https://github.com/openvinotoolkit/datumaro/pull/688>)
@@ -32,7 +53,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - TBD
 
 ### Fixed
-- TBD
+- Detection for LFW format
+  (<https://github.com/openvinotoolkit/datumaro/pull/680>)
 
 ### Security
 - TBD

README.md

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ CVAT annotations ---> Publication, statistics etc.
 - [Examples](https://openvinotoolkit.github.io/datumaro/docs/getting_started/#examples)
 - [Features](#features)
 - [User manual](https://openvinotoolkit.github.io/datumaro/docs/user-manual)
+- [Developer manual](https://openvinotoolkit.github.io/datumaro/api)
 - [Contributing](#contributing)
 
 ## Features

datumaro/cli/__main__.py

Lines changed: 5 additions & 0 deletions
@@ -93,6 +93,11 @@ def _get_known_commands():
         ("", None, ""),
         ("Dataset operations:", None, ""),
         ("convert", commands.convert, "Convert dataset between formats"),
+        (
+            "describe-downloads",
+            commands.describe_downloads,
+            "Print information about downloadable datasets",
+        ),
         ("detect-format", commands.detect_format, "Detect the format of a dataset"),
         ("diff", commands.diff, "Compare datasets"),
         ("download", commands.download, "Download a publicly available dataset"),

datumaro/cli/commands/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
     commit,
     convert,
     create,
+    describe_downloads,
     detect_format,
     diff,
     download,
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import contextlib
+import sys
+from typing import Dict, Type
+
+from datumaro.components.extractor_tfds import (
+    AVAILABLE_TFDS_DATASETS,
+    TFDS_EXTRACTOR_AVAILABLE,
+    TfdsDatasetRemoteMetadata,
+)
+from datumaro.util import dump_json
+
+from ..util import MultilineFormatter
+
+
+def build_parser(
+    parser_ctor: Type[argparse.ArgumentParser] = argparse.ArgumentParser,
+):
+    parser = parser_ctor(
+        help="Print information about downloadable datasets",
+        description="""
+        Reports information about datasets that can be downloaded with the
+        "datum download" command. The information is reported either as
+        human-readable text (the default) or as a JSON object.
+        """,
+        formatter_class=MultilineFormatter,
+    )
+
+    parser.add_argument(
+        "--report-format",
+        choices=("text", "json"),
+        default="text",
+        help="Format in which to report the information (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--report-file", help="File to which to write the report (default: standard output)"
+    )
+    parser.set_defaults(command=describe_downloads_command)
+
+    return parser
+
+
+def get_sensitive_args():
+    return {
+        describe_downloads_command: ["report-file"],
+    }
+
+
+def describe_downloads_command(args):
+    dataset_metas: Dict[str, TfdsDatasetRemoteMetadata] = {}
+
+    if TFDS_EXTRACTOR_AVAILABLE:
+        for dataset_name, dataset in AVAILABLE_TFDS_DATASETS.items():
+            dataset_metas[f"tfds:{dataset_name}"] = dataset.query_remote_metadata()
+
+    if args.report_format == "text":
+        with (
+            open(args.report_file, "w") if args.report_file else contextlib.nullcontext(sys.stdout)
+        ) as report_file:
+            if dataset_metas:
+                print("Available datasets:", file=report_file)
+
+                for name, meta in sorted(dataset_metas.items()):
+                    print(file=report_file)
+                    print(f"{name} ({meta.human_name}):", file=report_file)
+                    print(
+                        f"  default output format: {meta.default_output_format}",
+                        file=report_file,
+                    )
+
+                    print("  description:", file=report_file)
+                    for line in meta.description.rstrip("\n").split("\n"):
+                        print(f"    {line}", file=report_file)
+
+                    print(f"  download size: {meta.download_size} bytes", file=report_file)
+                    print(f"  home URL: {meta.home_url or 'N/A'}", file=report_file)
+                    print(f"  number of classes: {meta.num_classes}", file=report_file)
+                    print("  subsets:", file=report_file)
+                    for subset_name, subset_meta in sorted(meta.subsets.items()):
+                        print(f"    {subset_name}: {subset_meta.num_items} items", file=report_file)
+                    print(f"  version: {meta.version}", file=report_file)
+            else:
+                print("No datasets available.", file=report_file)
+                print(file=report_file)
+                print(
+                    "You can enable TFDS datasets by installing "
+                    "TensorFlow and TensorFlow Datasets:",
+                    file=report_file,
+                )
+                print("    pip install datumaro[tf,tfds]", file=report_file)
+
+    elif args.report_format == "json":
+
+        def meta_to_raw(meta: TfdsDatasetRemoteMetadata):
+            raw = {}
+
+            # We omit the media type from the output, because there is currently no mechanism
+            # for mapping media types to strings. The media type could be useful information
+            # for users, though, so we might want to implement such a mechanism eventually.
+
+            for attribute in (
+                "default_output_format",
+                "description",
+                "download_size",
+                "home_url",
+                "human_name",
+                "num_classes",
+                "version",
+            ):
+                raw[attribute] = getattr(meta, attribute)
+
+            raw["subsets"] = {
+                name: {"num_items": subset.num_items} for name, subset in meta.subsets.items()
+            }
+
+            return raw
+
+        with (
+            open(args.report_file, "wb")
+            if args.report_file
+            else contextlib.nullcontext(sys.stdout.buffer)
+        ) as report_file:
+            report_file.write(
+                dump_json(
+                    {name: meta_to_raw(meta) for name, meta in dataset_metas.items()},
+                    indent=True,
+                    append_newline=True,
+                )
+            )
+    else:
+        assert False, "unreachable code"
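The text branch of the new command uses `contextlib.nullcontext` so a single `with` block can manage either a real file (closed on exit) or `sys.stdout` (left open). The same idiom in isolation — `open_report` is a hypothetical helper name, not part of Datumaro's API:

```python
import contextlib
import sys


def open_report(path=None):
    # With a path: open a real file, which the with-block closes on exit.
    # Without one: wrap sys.stdout in nullcontext, so the with-block
    # does NOT close the interpreter's stdout when it exits.
    return open(path, "w") if path else contextlib.nullcontext(sys.stdout)


with open_report() as report_file:
    print("Available datasets:", file=report_file)
```

This avoids duplicating the report-writing code for the file and stdout cases, and avoids the classic bug of accidentally closing `sys.stdout`.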

datumaro/cli/commands/detect_format.py

Lines changed: 5 additions & 5 deletions
@@ -8,7 +8,7 @@
 from datumaro.cli.util.project import load_project
 from datumaro.components.environment import Environment
 from datumaro.components.errors import ProjectNotFoundError
-from datumaro.components.format_detection import RejectionReason, detect_dataset_format
+from datumaro.components.format_detection import RejectionReason
 from datumaro.util import dump_json_file
 from datumaro.util.scope import scope_add, scoped
 
@@ -53,6 +53,7 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
         help="Path to which to save a JSON report describing detected "
         "and rejected formats. By default, no report is saved.",
     )
+    parser.add_argument("--depth", help="The maximum depth for recursive search (default: 2)")
    parser.set_defaults(command=detect_format_command)
 
     return parser
@@ -90,10 +91,9 @@ def rejection_callback(
             "message": human_message,
         }
 
-    detected_formats = detect_dataset_format(
-        ((format_name, importer.detect) for format_name, importer in env.importers.items.items()),
-        args.url,
-        rejection_callback=rejection_callback,
+    depth = 2 if not args.depth else int(args.depth)
+    detected_formats = env.detect_dataset(
+        args.url, rejection_callback=rejection_callback, depth=depth
     )
     report["detected_formats"] = detected_formats
 
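The merged code defaults `--depth` manually (`depth = 2 if not args.depth else int(args.depth)`). An equivalent, arguably simpler formulation would let argparse handle both the type coercion and the default — a sketch, not the code actually merged:

```python
import argparse

parser = argparse.ArgumentParser()
# type=int makes argparse coerce (and validate) the value itself;
# default=2 replaces the manual fallback in the command handler.
parser.add_argument(
    "--depth",
    type=int,
    default=2,
    help="The maximum depth for recursive search (default: %(default)s)",
)

args = parser.parse_args([])               # no flag given -> default applies
deep = parser.parse_args(["--depth", "5"])
```

A side benefit: `type=int` rejects non-numeric input with a proper usage error instead of an unhandled `ValueError` from `int(args.depth)`.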
datumaro/cli/commands/download.py

Lines changed: 16 additions & 12 deletions
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 Intel Corporation
+# Copyright (C) 2021-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -7,11 +7,7 @@
 import os
 import os.path as osp
 
-from datumaro.components.extractor_tfds import (
-    AVAILABLE_TFDS_DATASETS,
-    TFDS_EXTRACTOR_AVAILABLE,
-    make_tfds_extractor,
-)
+from datumaro.components.extractor_tfds import AVAILABLE_TFDS_DATASETS, TFDS_EXTRACTOR_AVAILABLE
 from datumaro.components.project import Environment
 from datumaro.util.os_util import make_file_name
 
@@ -40,7 +36,8 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
        |n
        Supported datasets: {}|n
        |n
-        For information about the datasets, see the TFDS Catalog:
+        For information about the datasets, run "datum describe-downloads".
+        More detailed information can be found in the TFDS Catalog:
        <https://www.tensorflow.org/datasets/catalog/overview>.|n
        |n
        Supported output formats: {}|n
@@ -71,6 +68,7 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
     parser.add_argument(
         "--overwrite", action="store_true", help="Overwrite existing files in the save directory"
     )
+    parser.add_argument("-s", "--subset", help="Save only the specified subset")
     parser.add_argument(
         "extra_args",
         nargs=argparse.REMAINDER,
@@ -94,10 +92,10 @@ def download_command(args):
     if args.dataset_id.startswith("tfds:"):
         if TFDS_EXTRACTOR_AVAILABLE:
             tfds_ds_name = args.dataset_id[5:]
-            tfds_ds_metadata = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
-            if tfds_ds_metadata:
-                default_converter_name = tfds_ds_metadata.default_converter_name
-                extractor_factory = lambda: make_tfds_extractor(tfds_ds_name)
+            tfds_ds = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
+            if tfds_ds:
+                default_output_format = tfds_ds.metadata.default_output_format
+                extractor_factory = tfds_ds.make_extractor
             else:
                 raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")
         else:
@@ -109,7 +107,7 @@ def download_command(args):
     else:
         raise CliException(f"Unknown dataset ID '{args.dataset_id}'")
 
-    output_format = args.output_format or default_converter_name
+    output_format = args.output_format or default_output_format
 
     try:
         converter = env.converters[output_format]
@@ -136,6 +134,12 @@ def download_command(args):
     log.info("Downloading the dataset")
     extractor = extractor_factory()
 
+    if args.subset:
+        try:
+            extractor = extractor.subsets()[args.subset]
+        except KeyError:
+            raise CliException("Subset '%s' is not present in the dataset" % args.subset)
+
     log.info("Exporting the dataset")
     converter.convert(extractor, dst_dir, default_image_ext=".png", **extra_args)
 
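The new `--subset` handling looks the requested name up in the extractor's subset mapping and converts the resulting `KeyError` into a user-facing error. The same pattern in isolation — a sketch using `ValueError` in place of Datumaro's `CliException`, with a plain dict standing in for `extractor.subsets()`:

```python
def select_subset(subsets, name):
    # subsets stands in for extractor.subsets():
    # a mapping of subset name -> subset data.
    try:
        return subsets[name]
    except KeyError:
        # Re-raise as a user-facing error; "from None" suppresses the
        # internal KeyError traceback in the message shown to the user.
        raise ValueError(f"Subset '{name}' is not present in the dataset") from None


subsets = {"train": [1, 2, 3], "val": [4]}
```

Catching `KeyError` at the lookup site keeps the error message in the CLI's vocabulary ("subset") rather than leaking a raw dictionary exception to the user.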