Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow deleting a subset/config from a no-script dataset #6820

Merged
merged 15 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions docs/source/cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ positional arguments:
run_beam Run a Beam dataset processing pipeline
dummy_data Generate dummy data.
convert_to_parquet Convert dataset to Parquet
delete_from_hub Delete dataset config from the Hub

optional arguments:
-h, --help show this help message and exit
Expand Down Expand Up @@ -60,3 +61,35 @@ Do not forget that you need to log in first to your Hugging Face account:
```

</Tip>

## Delete from Hub

Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.

```bash
>>> datasets-cli delete_from_hub --help
usage: datasets-cli <command> [<args>] delete_from_hub [-h] [--token TOKEN] [--revision REVISION] dataset_id config_name

positional arguments:
dataset_id source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME
config_name config name to delete

optional arguments:
-h, --help show this help message and exit
--token TOKEN access token to the Hugging Face Hub
--revision REVISION source revision
```

For example:
```bash
>>> datasets-cli delete_from_hub USERNAME/DATASET_NAME CONFIG_NAME
```

<Tip>

Do not forget that you need to log in first to your Hugging Face account:
```bash
>>> huggingface-cli login
```

</Tip>
2 changes: 2 additions & 0 deletions src/datasets/commands/datasets_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from datasets.commands.convert import ConvertCommand
from datasets.commands.convert_to_parquet import ConvertToParquetCommand
from datasets.commands.delete_from_hub import DeleteFromHubCommand
from datasets.commands.dummy_data import DummyDataCommand
from datasets.commands.env import EnvironmentCommand
from datasets.commands.run_beam import RunBeamCommand
Expand All @@ -28,6 +29,7 @@ def main():
RunBeamCommand.register_subcommand(commands_parser)
DummyDataCommand.register_subcommand(commands_parser)
ConvertToParquetCommand.register_subcommand(commands_parser)
DeleteFromHubCommand.register_subcommand(commands_parser)

# Parse args
args, unknown_args = parser.parse_known_args()
Expand Down
42 changes: 42 additions & 0 deletions src/datasets/commands/delete_from_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from argparse import ArgumentParser
from typing import Optional

from datasets.commands import BaseDatasetsCLICommand
from datasets.hub import delete_from_hub


def _command_factory(args):
    """Build a `DeleteFromHubCommand` from the parsed argparse namespace."""
    return DeleteFromHubCommand(
        dataset_id=args.dataset_id,
        config_name=args.config_name,
        token=args.token,
        revision=args.revision,
    )


class DeleteFromHubCommand(BaseDatasetsCLICommand):
    """CLI command that opens a Hub PR deleting one config from a no-script dataset."""

    @staticmethod
    def register_subcommand(parser):
        """Attach the `delete_from_hub` subparser to the top-level `datasets-cli` parser."""
        subparser: ArgumentParser = parser.add_parser("delete_from_hub", help="Delete dataset config from the Hub")
        subparser.add_argument(
            "dataset_id", help="source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME"
        )
        subparser.add_argument("config_name", help="config name to delete")
        subparser.add_argument("--token", help="access token to the Hugging Face Hub")
        subparser.add_argument("--revision", help="source revision")
        # argparse dispatches to the factory, which builds this command object.
        subparser.set_defaults(func=_command_factory)

    def __init__(self, dataset_id: str, config_name: str, token: Optional[str], revision: Optional[str]):
        # Stash the CLI arguments; all the work happens in run().
        self._dataset_id = dataset_id
        self._config_name = config_name
        self._token = token
        self._revision = revision

    def run(self) -> None:
        """Delegate to `datasets.hub.delete_from_hub` (the returned CommitInfo is discarded)."""
        delete_from_hub(self._dataset_id, self._config_name, revision=self._revision, token=self._token)
87 changes: 87 additions & 0 deletions src/datasets/hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from itertools import chain
from typing import Optional, Union

from huggingface_hub import (
    CommitInfo,
    CommitOperationAdd,
    CommitOperationDelete,
    DatasetCard,
    DatasetCardData,
    HfApi,
    HfFileSystem,
    hf_hub_download,
)

from datasets import config
from datasets.info import DatasetInfosDict
from datasets.load import load_dataset_builder
from datasets.utils.metadata import MetadataConfigs


def delete_from_hub(
    repo_id: str,
    config_name: str,
    revision: Optional[str] = None,
    token: Optional[Union[bool, str]] = None,
) -> CommitInfo:
    """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.

    Opens a pull request that removes the config's data files hosted in the repo and strips
    the config from the README metadata (`config_names`, the configs field and `dataset_info`).

    Args:
        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
            `<org>/<dataset_name>`.
        config_name (`str`): Name of the dataset configuration.
        revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.

    Returns:
        huggingface_hub.CommitInfo: Info about the pull request created for the deletion.
    """
    operations = []
    # data_files: delete only the files actually hosted in this repo — resolved data files
    # may point at other repos, which must not be touched.
    fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token, trust_remote_code=False)
    for data_file in chain(*builder.config.data_files.values()):
        data_file_resolved_path = fs.resolve_path(data_file)
        if data_file_resolved_path.repo_id == repo_id:
            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
    # README.md: fetch it from the same revision we are deleting from and with the user's
    # token (`DatasetCard.load(repo_id)` would read the default branch anonymously, which is
    # wrong for non-main revisions and fails on private repos).
    readme_path = hf_hub_download(
        repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision, token=token
    )
    dataset_card = DatasetCard.load(readme_path)
    # config_names
    if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
        dataset_card.data["config_names"].remove(config_name)
    # metadata_configs: re-serialize through a scratch DatasetCardData so the field is
    # dropped entirely when the removed config was the last one.
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
    if metadata_configs:
        _ = metadata_configs.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        metadata_configs.to_dataset_card_data(dataset_card_data)
        if config.METADATA_CONFIGS_FIELD in dataset_card_data:
            dataset_card.data[config.METADATA_CONFIGS_FIELD] = dataset_card_data[config.METADATA_CONFIGS_FIELD]
        else:
            _ = dataset_card.data.pop(config.METADATA_CONFIGS_FIELD, None)
    # dataset_info: same round-trip as above
    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
    if dataset_infos:
        _ = dataset_infos.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        dataset_infos.to_dataset_card_data(dataset_card_data)
        if "dataset_info" in dataset_card_data:
            dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
        else:
            _ = dataset_card.data.pop("dataset_info", None)
    # Commit: file deletions plus the rewritten README, as a pull request
    operations.append(
        CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
    )
    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)
    commit_info = api.create_commit(
        repo_id,
        operations=operations,
        commit_message=f"Delete '{config_name}' config",
        commit_description=f"Delete '{config_name}' config.",
        token=token,
        repo_type="dataset",
        revision=revision,
        create_pr=True,
    )
    print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
    return commit_info
67 changes: 67 additions & 0 deletions tests/test_hub.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
from textwrap import dedent
from types import SimpleNamespace
from unittest.mock import patch
from urllib.parse import quote

import pytest
from huggingface_hub import CommitOperationAdd, CommitOperationDelete

import datasets
from datasets.config import METADATA_CONFIGS_FIELD
from datasets.hub import delete_from_hub
from datasets.utils.hub import hf_dataset_url


Expand All @@ -11,3 +18,63 @@
def test_dataset_url(repo_id, filename, revision):
url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)
assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}"


def test_delete_from_hub(
    temporary_repo, hf_api, hf_token, csv_path, tmp_path, ci_hub_config, ci_hfh_hf_hub_url
) -> None:
    """Check that delete_from_hub stages exactly the expected commit operations as a PR."""
    with temporary_repo() as repo_id:
        hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset")
        # Two data-only configs, one data file each.
        for config_name in ("cats", "dogs"):
            hf_api.upload_file(
                path_or_fileobj=str(csv_path),
                path_in_repo=f"{config_name}/train/0000.csv",
                repo_id=repo_id,
                repo_type="dataset",
                token=hf_token,
            )
        card_path = tmp_path / "README.md"
        card_path.write_text(
            dedent(f"""\
            ---
            {METADATA_CONFIGS_FIELD}:
            - config_name: cats
              data_files:
              - split: train
                path: cats/train/*
            - config_name: dogs
              data_files:
              - split: train
                path: dogs/train/*
            ---
            """)
        )
        hf_api.upload_file(
            token=hf_token,
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        fake_commit_info = SimpleNamespace(
            pr_url="https:///hub-ci.huggingface.co/datasets/__DUMMY_USER__/__DUMMY_DATASET__/refs%2Fpr%2F1"
        )
        # Stub out create_commit so nothing is actually deleted on the CI Hub.
        with patch.object(datasets.hub.HfApi, "create_commit", return_value=fake_commit_info) as mocked_create_commit:
            delete_from_hub(repo_id, "dogs")
        assert mocked_create_commit.called
        call_kwargs = mocked_create_commit.call_args.kwargs
        assert call_kwargs.get("commit_message") == "Delete 'dogs' config"
        assert call_kwargs.get("create_pr")
        assert call_kwargs.get("operations") == [
            CommitOperationDelete(path_in_repo="dogs/train/0000.csv", is_folder=False),
            CommitOperationAdd(
                path_in_repo="README.md",
                path_or_fileobj=b"---\nconfigs:\n- config_name: cats\n  data_files:\n  - split: train\n    path: cats/train/*\n---\n",
            ),
        ]
Loading