diff --git a/misc/py-datasets/Makefile b/misc/py-datasets/Makefile index 774f71a63205..b9356b8a6908 100644 --- a/misc/py-datasets/Makefile +++ b/misc/py-datasets/Makefile @@ -1,70 +1,84 @@ PORTNAME= datasets -DISTVERSION= 4.8.2 -PORTREVISION= 1 +DISTVERSION= 4.8.5 CATEGORIES= misc python # machine-learning MASTER_SITES= PYPI PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX} MAINTAINER= yuri@FreeBSD.org COMMENT= HuggingFace community-driven open-source library of datasets -WWW= https://huggingface.co/docs/datasets/index +WWW= https://huggingface.co/docs/datasets/index \ + https://github.com/huggingface/datasets LICENSE= MIT LICENSE_FILE= ${WRKSRC}/LICENSE BUILD_DEPENDS= ${PYTHON_PKGNAMEPREFIX}pyproject-hooks>0:devel/py-pyproject-hooks@${PY_FLAVOR} \ ${PY_SETUPTOOLS} \ ${PYTHON_PKGNAMEPREFIX}wheel>0:devel/py-wheel@${PY_FLAVOR} RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}dill>0.3.0:devel/py-dill@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}filelock>0:sysutils/py-filelock@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}fsspec>=2023.1.0:filesystems/py-fsspec@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}httpx>0:www/py-httpx@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}huggingface-hub>=0.25.0:misc/py-huggingface-hub@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}multiprocess>0:devel/py-multiprocess@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}numpy1>=1.16:math/py-numpy1@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}packaging>0:devel/py-packaging@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pandas>0:math/py-pandas@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pyarrow>=21.0.0:databases/py-pyarrow@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pyyaml>=5.1:devel/py-pyyaml@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR} -RUN_DEPENDS_AUDIO= \ - ${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \ +RUN_DEPENDS_AUDIO= 
${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pytorch>=2.8.0:misc/py-pytorch@${PY_FLAVOR} -RUN_DEPENDS_VISION= \ - ${PY_PILLOW} +RUN_DEPENDS_VISION= ${PY_PILLOW} RUN_DEPENDS+= ${RUN_DEPENDS_AUDIO} \ ${RUN_DEPENDS_VISION} TEST_DEPENDS= ${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}elasticsearch>0:textproc/py-elasticsearch@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}faiss>=1.6.4:math/py-faiss@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}jax>=0.3.14:math/py-jax@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}jiwer>0:misc/py-jiwer@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}joblib>=1.3.0:devel/py-joblib@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}lz4>=0:archivers/py-lz4@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}polars>=0.20.0:misc/py-polars@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}protobuf>=4.0.0:devel/py-protobuf@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pytest-datadir>=0:devel/py-pytest-datadir@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pytest-xdist>=0:devel/py-pytest-xdist@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pytest>=0:devel/py-pytest@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}pytorch>=2.0.0:misc/py-pytorch@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}rarfile>=4.0:archivers/py-rarfile@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}s3fs>=2021.11.1:filesystems/py-s3fs@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \ ${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR} -# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow +# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow, transformers USES= python USE_PYTHON= pep517 concurrent autoplist pytest -TEST_ENV= ${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR} +TEST_ENV= ${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR} \ + DATASETS_TEST_SKIP_TF=yes +PYTEST_ARGS= tests/ NO_ARCH= yes 
-pre-test: # prevent failure due to missing pyspark - @${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py +pre-patch: + @${MKDIR} ${WRKSRC}/tests/fixtures + @${TOUCH} ${WRKSRC}/tests/__init__.py ${WRKSRC}/tests/fixtures/__init__.py + +pre-test: # skip tests requiring unavailable dependencies/data + @${RM} ${WRKSRC}/tests/test_fingerprint_tokenizer_stability.py + @${RM} ${WRKSRC}/tests/test_formatting.py + @${RM} ${WRKSRC}/tests/test_load.py + @${RM} ${WRKSRC}/tests/test_distributed.py + @${RM} ${WRKSRC}/tests/test_search.py + @${RM} ${WRKSRC}/tests/test_table.py + @${RM} ${WRKSRC}/tests/test_builder.py + @${RM} ${WRKSRC}/tests/test_parallel.py + @${RM} ${WRKSRC}/tests/test_iterable_dataset.py + @${RM} ${WRKSRC}/tests/test_upstream_hub.py + @${RM} ${WRKSRC}/tests/test_fingerprint.py .include diff --git a/misc/py-datasets/distinfo b/misc/py-datasets/distinfo index 612ac838107b..19c878e8b494 100644 --- a/misc/py-datasets/distinfo +++ b/misc/py-datasets/distinfo @@ -1,3 +1,3 @@ -TIMESTAMP = 1773758107 -SHA256 (datasets-4.8.2.tar.gz) = c6ad7e6c28c7436a9c6c23f817d1a450d395c771df881252dfe63697297cbcdf -SIZE (datasets-4.8.2.tar.gz) = 603879 +TIMESTAMP = 1777403895 +SHA256 (datasets-4.8.5.tar.gz) = 0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772 +SIZE (datasets-4.8.5.tar.gz) = 605649 diff --git a/misc/py-datasets/files/patch-tests__test_patching.py b/misc/py-datasets/files/patch-tests__test_patching.py new file mode 100644 index 000000000000..6beda41ed21b --- /dev/null +++ b/misc/py-datasets/files/patch-tests__test_patching.py @@ -0,0 +1,17 @@ +-- This patch adds tests/_test_patching.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag. +-- Without this file, the test suite cannot be run. 
+--- /dev/null ++++ tests/_test_patching.py +@@ -0,0 +1,11 @@ ++# ruff: noqa: F401 ++# This is the module that test_patching.py uses to test patch_submodule() ++import os ++import os as renamed_os ++from os import path ++from os import path as renamed_path ++from os.path import join ++from os.path import join as renamed_join ++ ++ ++open = open # we just need to have a builtin inside this module to test it properly diff --git a/misc/py-datasets/files/patch-tests_conftest.py b/misc/py-datasets/files/patch-tests_conftest.py new file mode 100644 index 000000000000..248e9b692e63 --- /dev/null +++ b/misc/py-datasets/files/patch-tests_conftest.py @@ -0,0 +1,68 @@ +-- This patch adds tests/conftest.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag. +-- Without this file, the test suite cannot be run. +--- /dev/null ++++ tests/conftest.py +@@ -0,0 +1,62 @@ ++import pytest ++ ++import datasets ++import datasets.config ++ ++ ++# Import fixture modules as plugins ++pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"] ++ ++ ++def pytest_collection_modifyitems(config, items): ++ # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit") ++ for item in items: ++ if any(marker in item.keywords for marker in ["integration", "unit"]): ++ continue ++ item.add_marker(pytest.mark.unit) ++ ++ ++@pytest.fixture(autouse=True) ++def set_test_cache_config(tmp_path_factory, monkeypatch): ++ # test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work? 
++ test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache" ++ test_hf_datasets_cache = test_hf_cache_home / "datasets" ++ monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache)) ++ test_downloaded_datasets_path = test_hf_datasets_cache / "downloads" ++ monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path)) ++ test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted" ++ monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path)) ++ ++ # used in dataset viewer, we may set it to true by default in the future ++ monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True) ++ ++ ++@pytest.fixture(autouse=True) ++def disable_implicit_token(monkeypatch): ++ monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True) ++ ++ ++@pytest.fixture(autouse=True, scope="session") ++def disable_tqdm_output(): ++ datasets.disable_progress_bar() ++ ++ ++@pytest.fixture(autouse=True) ++def set_update_download_counts_to_false(monkeypatch): ++ # don't take tests into account when counting downloads ++ monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False) ++ ++ ++@pytest.fixture ++def set_sqlalchemy_silence_uber_warning(monkeypatch): ++ # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0 ++ # To be removed once SQLAlchemy 2.0 supported ++ try: ++ monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True) ++ except (ModuleNotFoundError, AttributeError): ++ pass ++ ++ ++@pytest.fixture(autouse=True, scope="session") ++def zero_time_out_for_remote_code(): ++ datasets.config.TIME_OUT_REMOTE_CODE = 0 diff --git a/misc/py-datasets/files/patch-tests_fixtures_files.py b/misc/py-datasets/files/patch-tests_fixtures_files.py new file mode 100644 index 000000000000..7053267f2eaa --- /dev/null +++ 
b/misc/py-datasets/files/patch-tests_fixtures_files.py @@ -0,0 +1,636 @@ +-- This patch adds tests/fixtures/files.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag. +-- Without this file, the test suite cannot be run. +--- /dev/null ++++ tests/fixtures/files.py +@@ -0,0 +1,630 @@ ++import contextlib ++import csv ++import json ++import os ++import sqlite3 ++import tarfile ++import textwrap ++import zipfile ++ ++import pandas as pd ++import pyarrow as pa ++import pyarrow.parquet as pq ++import pytest ++ ++import datasets ++import datasets.config ++ ++ ++# dataset + arrow_file ++ ++ ++@pytest.fixture(scope="session") ++def dataset(): ++ n = 10 ++ features = datasets.Features( ++ { ++ "tokens": datasets.List(datasets.Value("string")), ++ "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])), ++ "answers": { ++ "text": datasets.List(datasets.Value("string")), ++ "answer_start": datasets.List(datasets.Value("int32")), ++ }, ++ "id": datasets.Value("int64"), ++ } ++ ) ++ dataset = datasets.Dataset.from_dict( ++ { ++ "tokens": [["foo"] * 5] * n, ++ "labels": [[1] * 5] * n, ++ "answers": [{"answer_start": [97], "text": ["1976"]}] * 10, ++ "id": list(range(n)), ++ }, ++ features=features, ++ ) ++ return dataset ++ ++ ++@pytest.fixture(scope="session") ++def arrow_file(tmp_path_factory, dataset): ++ filename = str(tmp_path_factory.mktemp("data") / "file.arrow") ++ dataset.map(cache_file_name=filename) ++ return filename ++ ++ ++# FILE_CONTENT + files ++ ++ ++FILE_CONTENT = """\ ++ Text data. 
++ Second line of data.""" ++ ++ ++@pytest.fixture(scope="session") ++def text_file_content(): ++ return FILE_CONTENT ++ ++ ++@pytest.fixture(scope="session") ++def text_file(tmp_path_factory): ++ filename = tmp_path_factory.mktemp("data") / "file.txt" ++ data = FILE_CONTENT ++ with open(filename, "w") as f: ++ f.write(data) ++ return filename ++ ++ ++@pytest.fixture(scope="session") ++def bz2_file(tmp_path_factory): ++ import bz2 ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.bz2" ++ data = bytes(FILE_CONTENT, "utf-8") ++ with bz2.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def gz_file(tmp_path_factory): ++ import gzip ++ ++ path = str(tmp_path_factory.mktemp("data") / "file.txt.gz") ++ data = bytes(FILE_CONTENT, "utf-8") ++ with gzip.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def lz4_file(tmp_path_factory): ++ if datasets.config.LZ4_AVAILABLE: ++ import lz4.frame ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.lz4" ++ data = bytes(FILE_CONTENT, "utf-8") ++ with lz4.frame.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def seven_zip_file(tmp_path_factory, text_file): ++ if datasets.config.PY7ZR_AVAILABLE: ++ import py7zr ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.7z" ++ with py7zr.SevenZipFile(path, "w") as archive: ++ archive.write(text_file, arcname=os.path.basename(text_file)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def tar_file(tmp_path_factory, text_file): ++ import tarfile ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.tar" ++ with tarfile.TarFile(path, "w") as f: ++ f.add(text_file, arcname=os.path.basename(text_file)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def xz_file(tmp_path_factory): ++ import lzma ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.xz" ++ data = bytes(FILE_CONTENT, "utf-8") ++ with 
lzma.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_file(tmp_path_factory, text_file): ++ import zipfile ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(text_file, arcname=os.path.basename(text_file)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zstd_file(tmp_path_factory): ++ if datasets.config.ZSTANDARD_AVAILABLE: ++ import zstandard as zstd ++ ++ path = tmp_path_factory.mktemp("data") / "file.txt.zst" ++ data = bytes(FILE_CONTENT, "utf-8") ++ with zstd.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++# xml_file ++ ++ ++@pytest.fixture(scope="session") ++def xml_file(tmp_path_factory): ++ filename = tmp_path_factory.mktemp("data") / "file.xml" ++ data = textwrap.dedent( ++ """\ ++ ++ ++
++ ++ ++ Contingut 1 ++ Content 1 ++ ++ ++ Contingut 2 ++ Content 2 ++ ++ ++ Contingut 3 ++ Content 3 ++ ++ ++ Contingut 4 ++ Content 4 ++ ++ ++ Contingut 5 ++ Content 5 ++ ++ ++ """ ++ ) ++ with open(filename, "w") as f: ++ f.write(data) ++ return filename ++ ++ ++DATA = [ ++ {"col_1": "0", "col_2": 0, "col_3": 0.0}, ++ {"col_1": "1", "col_2": 1, "col_3": 1.0}, ++ {"col_1": "2", "col_2": 2, "col_3": 2.0}, ++ {"col_1": "3", "col_2": 3, "col_3": 3.0}, ++] ++DATA2 = [ ++ {"col_1": "4", "col_2": 4, "col_3": 4.0}, ++ {"col_1": "5", "col_2": 5, "col_3": 5.0}, ++] ++DATA_DICT_OF_LISTS = { ++ "col_1": ["0", "1", "2", "3"], ++ "col_2": [0, 1, 2, 3], ++ "col_3": [0.0, 1.0, 2.0, 3.0], ++} ++ ++DATA_312 = [ ++ {"col_3": 0.0, "col_1": "0", "col_2": 0}, ++ {"col_3": 1.0, "col_1": "1", "col_2": 1}, ++] ++ ++DATA_STR = [ ++ {"col_1": "s0", "col_2": 0, "col_3": 0.0}, ++ {"col_1": "s1", "col_2": 1, "col_3": 1.0}, ++ {"col_1": "s2", "col_2": 2, "col_3": 2.0}, ++ {"col_1": "s3", "col_2": 3, "col_3": 3.0}, ++] ++ ++DATA_MISSING_FIELDS = [ ++ {"col_1": 1, "col_2": 2}, ++ {"col_1": 1, "col_3": 3}, ++] ++ ++DATA_MIXED_TYPES = [ ++ {"col_1": 1, "col_2": {"a": "a"}, "col_3": [{"x": "x"}]}, ++ {"col_1": "one", "col_2": {"b": "b"}, "col_3": [{"y": "y"}]}, ++ {"col_1": None, "col_2": None, "col_3": [None]}, ++] ++ ++ ++@pytest.fixture(scope="session") ++def dataset_dict(): ++ return DATA_DICT_OF_LISTS ++ ++ ++@pytest.fixture(scope="session") ++def arrow_path(tmp_path_factory): ++ dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS) ++ path = str(tmp_path_factory.mktemp("data") / "dataset.arrow") ++ dataset.map(cache_file_name=path) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def sqlite_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite") ++ with contextlib.closing(sqlite3.connect(path)) as con: ++ cur = con.cursor() ++ cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)") ++ for item in DATA: ++ cur.execute("INSERT INTO 
dataset(col_1, col_2, col_3) VALUES (?, ?, ?)", tuple(item.values())) ++ con.commit() ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def csv_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.csv") ++ with open(path, "w", newline="") as f: ++ writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"]) ++ writer.writeheader() ++ for item in DATA: ++ writer.writerow(item) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def csv2_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset2.csv") ++ with open(path, "w", newline="") as f: ++ writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"]) ++ writer.writeheader() ++ for item in DATA: ++ writer.writerow(item) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def bz2_csv_path(csv_path, tmp_path_factory): ++ import bz2 ++ ++ path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2" ++ with open(csv_path, "rb") as f: ++ data = f.read() ++ # data = bytes(FILE_CONTENT, "utf-8") ++ with bz2.open(path, "wb") as f: ++ f.write(data) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_csv_path(csv_path, csv2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(csv_path, arcname=os.path.basename(csv_path)) ++ f.write(csv2_path, arcname=os.path.basename(csv2_path)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.csv.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(csv_path, arcname=os.path.basename(csv_path.replace(".csv", ".CSV"))) ++ f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(".csv", ".CSV"))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / 
"dataset_with_dir.csv.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path))) ++ f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def parquet_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.parquet") ++ schema = pa.schema( ++ { ++ "col_1": pa.string(), ++ "col_2": pa.int64(), ++ "col_3": pa.float64(), ++ } ++ ) ++ with open(path, "wb") as f: ++ writer = pq.ParquetWriter(f, schema=schema) ++ pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema) ++ writer.write_table(pa_table) ++ writer.close() ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def geoparquet_path(tmp_path_factory): ++ df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet") ++ path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet") ++ df.to_parquet(path=path) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def json_list_of_dicts_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.json") ++ data = {"data": DATA} ++ with open(path, "w") as f: ++ json.dump(data, f) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def json_dict_of_lists_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.json") ++ data = {"data": DATA_DICT_OF_LISTS} ++ with open(path, "w") as f: ++ json.dump(data, f) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl") ++ with open(path, "w") as f: ++ for item in DATA: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl2_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl") ++ with open(path, "w") as f: ++ 
for item in DATA: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_312_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl") ++ with open(path, "w") as f: ++ for item in DATA_312: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_str_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl") ++ with open(path, "w") as f: ++ for item in DATA_STR: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_missing_fields_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset-missing-fields.jsonl") ++ with open(path, "w") as f: ++ for item in DATA_MISSING_FIELDS: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_mixed_types_path(tmp_path_factory): ++ path = str(tmp_path_factory.mktemp("data") / "dataset-mixed-types.jsonl") ++ with open(path, "w") as f: ++ for item in DATA_MIXED_TYPES: ++ f.write(json.dumps(item) + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def text_gz_path(tmp_path_factory, text_path): ++ import gzip ++ ++ path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz") ++ with open(text_path, "rb") as orig_file: ++ with gzip.open(path, "wb") as zipped_file: ++ zipped_file.writelines(orig_file) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def jsonl_gz_path(tmp_path_factory, jsonl_path): ++ import gzip ++ ++ path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz") ++ with open(jsonl_path, "rb") as orig_file: ++ with gzip.open(path, "wb") as zipped_file: ++ zipped_file.writelines(orig_file) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip" ++ with zipfile.ZipFile(path, "w") as f: 
++ f.write(jsonl_path, arcname=os.path.basename(jsonl_path)) ++ f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path))) ++ f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar" ++ with tarfile.TarFile(path, "w") as f: ++ f.add(jsonl_path, arcname=os.path.basename(jsonl_path)) ++ f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar" ++ with tarfile.TarFile(path, "w") as f: ++ f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def text_path(tmp_path_factory): ++ data = ["0", "1", "2", "3"] ++ path = str(tmp_path_factory.mktemp("data") / "dataset.txt") ++ with open(path, "w") as f: ++ for item in data: ++ f.write(item + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def text2_path(tmp_path_factory): ++ data = ["0", "1", "2", "3"] ++ path = 
str(tmp_path_factory.mktemp("data") / "dataset2.txt") ++ with open(path, "w") as f: ++ for item in data: ++ f.write(item + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def text_dir(tmp_path_factory): ++ data = ["0", "1", "2", "3"] ++ path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt" ++ with open(path, "w") as f: ++ for item in data: ++ f.write(item + "\n") ++ return path.parent ++ ++ ++@pytest.fixture(scope="session") ++def text_dir_with_unsupported_extension(tmp_path_factory): ++ data = ["0", "1", "2", "3"] ++ path = tmp_path_factory.mktemp("data") / "dataset.abc" ++ with open(path, "w") as f: ++ for item in data: ++ f.write(item + "\n") ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_text_path(text_path, text2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.text.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(text_path, arcname=os.path.basename(text_path)) ++ f.write(text2_path, arcname=os.path.basename(text2_path)) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path))) ++ f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path))) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.ext.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(text_path, arcname=os.path.basename("unsupported.ext")) ++ f.write(text2_path, arcname=os.path.basename("unsupported_2.ext")) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def text_path_with_unicode_new_lines(tmp_path_factory): ++ text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"]) ++ 
path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt") ++ with open(path, "w", encoding="utf-8") as f: ++ f.write(text) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def image_file(): ++ return os.path.join("tests", "features", "data", "test_image_rgb.jpg") ++ ++ ++@pytest.fixture(scope="session") ++def audio_file(): ++ return os.path.join("tests", "features", "data", "test_audio_44100.wav") ++ ++ ++@pytest.fixture(scope="session") ++def audio_file_44100(): ++ return os.path.join("tests", "features", "data", "test_audio_44100.mp3") ++ ++ ++@pytest.fixture(scope="session") ++def audio_file_16000(): ++ return os.path.join("tests", "features", "data", "test_audio_16000.mp3") ++ ++ ++@pytest.fixture(scope="session") ++def tensor_file(tmp_path_factory): ++ import torch ++ ++ path = tmp_path_factory.mktemp("data") / "tensor.pth" ++ with open(path, "wb") as f: ++ torch.save(torch.ones(128), f) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def zip_image_path(image_file, tmp_path_factory): ++ path = tmp_path_factory.mktemp("data") / "dataset.img.zip" ++ with zipfile.ZipFile(path, "w") as f: ++ f.write(image_file, arcname=os.path.basename(image_file)) ++ f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg")) ++ return path ++ ++ ++@pytest.fixture(scope="session") ++def data_dir_with_hidden_files(tmp_path_factory): ++ data_dir = tmp_path_factory.mktemp("data_dir") ++ ++ (data_dir / "subdir").mkdir() ++ with open(data_dir / "subdir" / "train.txt", "w") as f: ++ f.write("foo\n" * 10) ++ with open(data_dir / "subdir" / "test.txt", "w") as f: ++ f.write("bar\n" * 10) ++ # hidden file ++ with open(data_dir / "subdir" / ".test.txt", "w") as f: ++ f.write("bar\n" * 10) ++ ++ # hidden directory ++ (data_dir / ".subdir").mkdir() ++ with open(data_dir / ".subdir" / "train.txt", "w") as f: ++ f.write("foo\n" * 10) ++ with open(data_dir / ".subdir" / "test.txt", "w") as f: ++ f.write("bar\n" * 10) ++ ++ 
return data_dir diff --git a/misc/py-datasets/files/patch-tests_fixtures_fsspec.py b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py new file mode 100644 index 000000000000..311541e7a5dd --- /dev/null +++ b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py @@ -0,0 +1,119 @@ +-- This patch adds tests/fixtures/fsspec.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag. +-- Without this file, the test suite cannot be run. +--- /dev/null ++++ tests/fixtures/fsspec.py +@@ -0,0 +1,113 @@ ++import posixpath ++from pathlib import Path ++from unittest.mock import patch ++ ++import pytest ++from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem, stringify_path ++from fsspec.registry import _registry as _fsspec_registry ++ ++ ++class MockFileSystem(AbstractFileSystem): ++ protocol = "mock" ++ ++ def __init__(self, *args, local_root_dir, **kwargs): ++ super().__init__() ++ self._fs = LocalFileSystem(*args, **kwargs) ++ self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/" ++ ++ def mkdir(self, path, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.mkdir(path, *args, **kwargs) ++ ++ def makedirs(self, path, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.makedirs(path, *args, **kwargs) ++ ++ def rmdir(self, path): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.rmdir(path) ++ ++ def ls(self, path, detail=True, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ out = self._fs.ls(path, detail=detail, *args, **kwargs) ++ if detail: ++ return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out] ++ else: ++ return [name[len(self.local_root_dir) :] for name in out] ++ ++ def info(self, path, *args, **kwargs): ++ path = 
posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ out = dict(self._fs.info(path, *args, **kwargs)) ++ out["name"] = out["name"][len(self.local_root_dir) :] ++ return out ++ ++ def cp_file(self, path1, path2, *args, **kwargs): ++ path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1)) ++ path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2)) ++ return self._fs.cp_file(path1, path2, *args, **kwargs) ++ ++ def rm_file(self, path, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.rm_file(path, *args, **kwargs) ++ ++ def rm(self, path, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.rm(path, *args, **kwargs) ++ ++ def _open(self, path, *args, **kwargs): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs._open(path, *args, **kwargs) ++ ++ def created(self, path): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.created(path) ++ ++ def modified(self, path): ++ path = posixpath.join(self.local_root_dir, self._strip_protocol(path)) ++ return self._fs.modified(path) ++ ++ @classmethod ++ def _strip_protocol(cls, path): ++ path = stringify_path(path) ++ if path.startswith("mock://"): ++ path = path[7:] ++ return path ++ ++ ++class TmpDirFileSystem(MockFileSystem): ++ protocol = "tmp" ++ tmp_dir = None ++ ++ def __init__(self, *args, **kwargs): ++ assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set" ++ super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True) ++ ++ @classmethod ++ def _strip_protocol(cls, path): ++ path = stringify_path(path) ++ if path.startswith("tmp://"): ++ path = path[6:] ++ return path ++ ++ ++@pytest.fixture ++def mock_fsspec(): ++ _fsspec_registry["mock"] = MockFileSystem ++ _fsspec_registry["tmp"] = TmpDirFileSystem ++ yield ++ del 
_fsspec_registry["mock"] ++ del _fsspec_registry["tmp"] ++ ++ ++@pytest.fixture ++def mockfs(tmp_path_factory, mock_fsspec): ++ local_fs_dir = tmp_path_factory.mktemp("mockfs") ++ return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True) ++ ++ ++@pytest.fixture ++def tmpfs(tmp_path_factory, mock_fsspec): ++ tmp_fs_dir = tmp_path_factory.mktemp("tmpfs") ++ with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir): ++ yield TmpDirFileSystem() ++ TmpDirFileSystem.clear_instance_cache() diff --git a/misc/py-datasets/files/patch-tests_fixtures_hub.py b/misc/py-datasets/files/patch-tests_fixtures_hub.py new file mode 100644 index 000000000000..771dd0d56344 --- /dev/null +++ b/misc/py-datasets/files/patch-tests_fixtures_hub.py @@ -0,0 +1,235 @@ +-- This patch adds tests/fixtures/hub.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag. +-- Without this file, the test suite cannot be run. +--- /dev/null ++++ tests/fixtures/hub.py +@@ -0,0 +1,229 @@ ++import os ++import time ++import uuid ++from contextlib import contextmanager ++from typing import Optional ++ ++import pytest ++from huggingface_hub.hf_api import HfApi ++from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError ++from huggingface_hub.utils._headers import _http_user_agent ++from packaging import version ++ ++from datasets import config ++ ++ ++if config.HF_HUB_VERSION >= version.parse("1.6.0"): ++ from huggingface_hub.errors import BucketNotFoundError ++ ++else: ++ BucketNotFoundError = None ++ ++CI_HUB_USER = "__DUMMY_DATASETS_USER__" ++CI_HUB_USER_FULL_NAME = "Dummy User" ++CI_HUB_USER_TOKEN = "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" ++ ++CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co" ++CI_HUB_DATASETS_URL = CI_HUB_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}" ++CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE = CI_HUB_ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}" ++ ++ ++@pytest.fixture ++def 
ci_hub_config(monkeypatch): ++ monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT) ++ monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL) ++ monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE) ++ try: ++ # for backward compatibility with huggingface_hub 0.x ++ monkeypatch.setattr( ++ "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE ++ ) ++ except AttributeError: ++ pass ++ old_environ = dict(os.environ) ++ os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT ++ yield ++ os.environ.clear() ++ os.environ.update(old_environ) ++ ++ ++@pytest.fixture ++def set_ci_hub_access_token(ci_hub_config, monkeypatch): ++ # Enable implicit token ++ monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", False) ++ old_environ = dict(os.environ) ++ os.environ["HF_TOKEN"] = CI_HUB_USER_TOKEN ++ os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "0" ++ yield ++ os.environ.clear() ++ os.environ.update(old_environ) ++ ++ ++def _http_ci_user_agent(*args, **kwargs): ++ ua = _http_user_agent(*args, **kwargs) ++ return ua + os.environ.get("CI_HEADERS", "") ++ ++ ++@pytest.fixture(autouse=True) ++def set_hf_ci_headers(monkeypatch): ++ old_environ = dict(os.environ) ++ os.environ["TRANSFORMERS_IS_CI"] = "1" ++ monkeypatch.setattr("huggingface_hub.utils._headers._http_user_agent", _http_ci_user_agent) ++ yield ++ os.environ.clear() ++ os.environ.update(old_environ) ++ ++ ++@pytest.fixture(scope="session") ++def hf_api(): ++ return HfApi(endpoint=CI_HUB_ENDPOINT) ++ ++ ++@pytest.fixture(scope="session") ++def hf_token(): ++ yield CI_HUB_USER_TOKEN ++ ++ ++@pytest.fixture ++def cleanup_repo(hf_api: HfApi): ++ def _cleanup_repo(repo_id): ++ hf_api.delete_repo(repo_id, token=CI_HUB_USER_TOKEN, repo_type="dataset") ++ ++ return _cleanup_repo ++ ++ ++@pytest.fixture ++def cleanup_bucket(hf_api: HfApi): ++ def _cleanup_bucket(bucket_id): ++ 
hf_api.delete_bucket(bucket_id, token=CI_HUB_USER_TOKEN) ++ ++ return _cleanup_bucket ++ ++ ++@pytest.fixture ++def temporary_repo(cleanup_repo): ++ @contextmanager ++ def _temporary_repo(repo_id: Optional[str] = None): ++ repo_id = repo_id or f"{CI_HUB_USER}/test-dataset-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}" ++ try: ++ yield repo_id ++ finally: ++ try: ++ cleanup_repo(repo_id) ++ except RepositoryNotFoundError: ++ pass ++ ++ return _temporary_repo ++ ++ ++@pytest.fixture ++def temporary_bucket(cleanup_bucket): ++ @contextmanager ++ def _temporary_bucket(bucket_id: Optional[str] = None): ++ bucket_id = bucket_id or f"{CI_HUB_USER}/test-bucket-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}" ++ try: ++ yield bucket_id ++ finally: ++ try: ++ cleanup_bucket(bucket_id) ++ except BucketNotFoundError: ++ pass ++ ++ return _temporary_bucket ++ ++ ++@pytest.fixture(scope="session") ++def _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content): ++ repo_name = f"repo_txt_data-{int(time.time() * 10e6)}" ++ repo_id = f"{CI_HUB_USER}/{repo_name}" ++ hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset") ++ hf_api.upload_file( ++ token=hf_token, ++ path_or_fileobj=text_file_content.encode(), ++ path_in_repo="data/text_data.txt", ++ repo_id=repo_id, ++ repo_type="dataset", ++ ) ++ hf_api.update_repo_settings(repo_id, token=hf_token, repo_type="dataset", gated="auto") ++ yield repo_id ++ try: ++ hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") ++ except (HfHubHTTPError, ValueError): # catch http error and token invalid error ++ pass ++ ++ ++@pytest.fixture() ++def hf_gated_dataset_repo_txt_data(_hf_gated_dataset_repo_txt_data, ci_hub_config): ++ return _hf_gated_dataset_repo_txt_data ++ ++ ++@pytest.fixture(scope="session") ++def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content): ++ repo_name = f"repo_txt_data-{int(time.time() * 10e6)}" ++ repo_id = f"{CI_HUB_USER}/{repo_name}" ++ 
hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) ++ hf_api.upload_file( ++ token=hf_token, ++ path_or_fileobj=text_file_content.encode(), ++ path_in_repo="data/text_data.txt", ++ repo_id=repo_id, ++ repo_type="dataset", ++ ) ++ yield repo_id ++ try: ++ hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") ++ except (HfHubHTTPError, ValueError): # catch http error and token invalid error ++ pass ++ ++ ++@pytest.fixture() ++def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_config): ++ return hf_private_dataset_repo_txt_data_ ++ ++ ++@pytest.fixture(scope="session") ++def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path): ++ repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e6)}" ++ repo_id = f"{CI_HUB_USER}/{repo_name}" ++ hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) ++ hf_api.upload_file( ++ token=hf_token, ++ path_or_fileobj=str(zip_csv_with_dir_path), ++ path_in_repo="data.zip", ++ repo_id=repo_id, ++ repo_type="dataset", ++ ) ++ yield repo_id ++ try: ++ hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") ++ except (HfHubHTTPError, ValueError): # catch http error and token invalid error ++ pass ++ ++ ++@pytest.fixture() ++def hf_private_dataset_repo_zipped_txt_data(hf_private_dataset_repo_zipped_txt_data_, ci_hub_config): ++ return hf_private_dataset_repo_zipped_txt_data_ ++ ++ ++@pytest.fixture(scope="session") ++def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path): ++ repo_name = f"repo_zipped_img_data-{int(time.time() * 10e6)}" ++ repo_id = f"{CI_HUB_USER}/{repo_name}" ++ hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) ++ hf_api.upload_file( ++ token=hf_token, ++ path_or_fileobj=str(zip_image_path), ++ path_in_repo="data.zip", ++ repo_id=repo_id, ++ repo_type="dataset", ++ ) ++ yield repo_id ++ try: ++ hf_api.delete_repo(repo_id, 
token=hf_token, repo_type="dataset") ++ except (HfHubHTTPError, ValueError): # catch http error and token invalid error ++ pass ++ ++ ++@pytest.fixture() ++def hf_private_dataset_repo_zipped_img_data(hf_private_dataset_repo_zipped_img_data_, ci_hub_config): ++ return hf_private_dataset_repo_zipped_img_data_ diff --git a/misc/py-datasets/files/patch-tests_utils.py b/misc/py-datasets/files/patch-tests_utils.py new file mode 100644 index 000000000000..fa46c80d083f --- /dev/null +++ b/misc/py-datasets/files/patch-tests_utils.py @@ -0,0 +1,626 @@ +-- This patch adds tests/utils.py which is missing from the PyPI source distribution. +-- The file is taken from the GitHub repository at the same version tag (4.8.5). +-- Without this file, the test suite cannot be run. +--- /dev/null ++++ tests/utils.py +@@ -0,0 +1,620 @@ ++import asyncio ++import importlib.metadata ++import os ++import re ++import sys ++import tempfile ++import unittest ++from contextlib import contextmanager ++from copy import deepcopy ++from distutils.util import strtobool ++from enum import Enum ++from importlib.util import find_spec ++from pathlib import Path ++from unittest.mock import Mock, patch ++ ++import httpx ++import pyarrow as pa ++import pytest ++import requests ++from packaging import version ++ ++from datasets import config ++ ++ ++def parse_flag_from_env(key, default=False): ++ try: ++ value = os.environ[key] ++ except KeyError: ++ # KEY isn't set, default to `default`. ++ _value = default ++ else: ++ # KEY is set, convert it to True or False. ++ try: ++ _value = strtobool(value) ++ except ValueError: ++ # More values are supported, but let's keep the message simple. 
++ raise ValueError(f"If set, {key} must be yes or no.") ++ return _value ++ ++ ++_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) ++_run_remote_tests = parse_flag_from_env("RUN_REMOTE", default=False) ++_run_local_tests = parse_flag_from_env("RUN_LOCAL", default=True) ++_run_packaged_tests = parse_flag_from_env("RUN_PACKAGED", default=True) ++ ++# Compression ++require_lz4 = pytest.mark.skipif(not config.LZ4_AVAILABLE, reason="test requires lz4") ++require_py7zr = pytest.mark.skipif(not config.PY7ZR_AVAILABLE, reason="test requires py7zr") ++require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard") ++ ++# Dill-cloudpickle compatibility ++require_dill_gt_0_3_2 = pytest.mark.skipif( ++ config.DILL_VERSION <= version.parse("0.3.2"), ++ reason="test requires dill>0.3.2 for cloudpickle compatibility", ++) ++ ++# Windows ++require_not_windows = pytest.mark.skipif( ++ sys.platform == "win32", ++ reason="test should not be run on Windows", ++) ++ ++ ++require_faiss = pytest.mark.skipif(find_spec("faiss") is None or sys.platform == "win32", reason="test requires faiss") ++require_moto = pytest.mark.skipif(find_spec("moto") is None, reason="test requires moto") ++require_numpy1_on_windows = pytest.mark.skipif( ++ version.parse(importlib.metadata.version("numpy")) >= version.parse("2.0.0") and sys.platform == "win32", ++ reason="test requires numpy < 2.0 on windows", ++) ++ ++IS_HF_HUB_1_x = config.HF_HUB_VERSION >= version.parse("0.99") # clunky but works with pre-releases ++ ++ ++def require_buckets_support_in_huggingface_hub(test_case): ++ """ ++ Decorator marking a test that requires buckets support in huggingface_hub. ++ ++ These tests are skipped when huggingface_hub's version doesn't support buckets. 
++ ++ """ ++ try: ++ from huggingface_hub.utils import BucketNotFoundError # noqa ++ except ImportError: ++ test_case = unittest.skip("test requires buckets support in huggingface_hub")(test_case) ++ return test_case ++ ++ ++def require_regex(test_case): ++ """ ++ Decorator marking a test that requires regex. ++ ++ These tests are skipped when Regex isn't installed. ++ ++ """ ++ try: ++ import regex # noqa ++ except ImportError: ++ test_case = unittest.skip("test requires regex")(test_case) ++ return test_case ++ ++ ++def require_elasticsearch(test_case): ++ """ ++ Decorator marking a test that requires ElasticSearch. ++ ++ These tests are skipped when ElasticSearch isn't installed. ++ ++ """ ++ try: ++ import elasticsearch # noqa ++ except ImportError: ++ test_case = unittest.skip("test requires elasticsearch")(test_case) ++ return test_case ++ ++ ++def require_sqlalchemy(test_case): ++ """ ++ Decorator marking a test that requires SQLAlchemy. ++ ++ These tests are skipped when SQLAlchemy isn't installed. ++ ++ """ ++ try: ++ import sqlalchemy # noqa ++ except ImportError: ++ test_case = unittest.skip("test requires sqlalchemy")(test_case) ++ return test_case ++ ++ ++def require_torch(test_case): ++ """ ++ Decorator marking a test that requires PyTorch. ++ ++ These tests are skipped when PyTorch isn't installed. ++ ++ """ ++ if not config.TORCH_AVAILABLE: ++ test_case = unittest.skip("test requires PyTorch")(test_case) ++ return test_case ++ ++ ++def require_torch_compile(test_case): ++ """ ++ Decorator marking a test that requires PyTorch. ++ ++ These tests are skipped when PyTorch isn't installed. 
++ ++ """ ++ if not config.TORCH_AVAILABLE: ++ test_case = unittest.skip("test requires PyTorch")(test_case) ++ if config.PY_VERSION >= version.parse("3.14"): ++ test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case) ++ return test_case ++ ++ ++def require_polars(test_case): ++ """ ++ Decorator marking a test that requires Polars. ++ ++ These tests are skipped when Polars isn't installed. ++ ++ """ ++ if not config.POLARS_AVAILABLE: ++ test_case = unittest.skip("test requires Polars")(test_case) ++ return test_case ++ ++ ++def require_tf(test_case): ++ """ ++ Decorator marking a test that requires TensorFlow. ++ ++ These tests are skipped when TensorFlow isn't installed. ++ ++ """ ++ if not config.TF_AVAILABLE or os.environ.get("DATASETS_TEST_SKIP_TF"): ++ test_case = unittest.skip("test requires TensorFlow")(test_case) ++ return test_case ++ ++ ++def require_jax(test_case): ++ """ ++ Decorator marking a test that requires JAX. ++ ++ These tests are skipped when JAX isn't installed. ++ ++ """ ++ if not config.JAX_AVAILABLE: ++ test_case = unittest.skip("test requires JAX")(test_case) ++ return test_case ++ ++ ++def require_pil(test_case): ++ """ ++ Decorator marking a test that requires Pillow. ++ ++ These tests are skipped when Pillow isn't installed. ++ ++ """ ++ if not config.PIL_AVAILABLE: ++ test_case = unittest.skip("test requires Pillow")(test_case) ++ return test_case ++ ++ ++def require_torchvision(test_case): ++ """ ++ Decorator marking a test that requires torchvision. ++ ++ These tests are skipped when torchvision isn't installed. ++ ++ """ ++ if not config.TORCHVISION_AVAILABLE: ++ test_case = unittest.skip("test requires torchvision")(test_case) ++ return test_case ++ ++ ++def require_torchcodec(test_case): ++ """ ++ Decorator marking a test that requires torchcodec. ++ ++ These tests are skipped when torchcodec isn't installed. 
++ ++ """ ++ if not config.TORCHCODEC_AVAILABLE: ++ test_case = unittest.skip("test requires torchcodec")(test_case) ++ return test_case ++ ++ ++def require_pdfplumber(test_case): ++ """ ++ Decorator marking a test that requires pdfplumber. ++ ++ These tests are skipped when decord isn't installed. ++ ++ """ ++ if not config.PDFPLUMBER_AVAILABLE: ++ test_case = unittest.skip("test requires pdfplumber")(test_case) ++ return test_case ++ ++ ++def require_nibabel(test_case): ++ """ ++ Decorator marking a test that requires nibabel. ++ ++ These tests are skipped when nibabel isn't installed. ++ ++ """ ++ if not config.NIBABEL_AVAILABLE: ++ test_case = unittest.skip("test requires nibabel")(test_case) ++ return test_case ++ ++ ++def require_transformers(test_case): ++ """ ++ Decorator marking a test that requires transformers. ++ ++ These tests are skipped when transformers isn't installed. ++ ++ """ ++ try: ++ import transformers # noqa F401 ++ except ImportError: ++ return unittest.skip("test requires transformers")(test_case) ++ else: ++ return test_case ++ ++ ++def require_tiktoken(test_case): ++ """ ++ Decorator marking a test that requires tiktoken. ++ ++ These tests are skipped when transformers isn't installed. ++ ++ """ ++ try: ++ import tiktoken # noqa F401 ++ except ImportError: ++ return unittest.skip("test requires tiktoken")(test_case) ++ else: ++ return test_case ++ ++ ++def require_spacy(test_case): ++ """ ++ Decorator marking a test that requires spacy. ++ ++ These tests are skipped when they aren't installed. ++ ++ """ ++ try: ++ import spacy # noqa F401 ++ except ImportError: ++ return unittest.skip("test requires spacy")(test_case) ++ else: ++ return test_case ++ ++ ++def require_pyspark(test_case): ++ """ ++ Decorator marking a test that requires pyspark. ++ ++ These tests are skipped when pyspark isn't installed. 
++ ++ """ ++ try: ++ import pyspark # noqa F401 ++ except ImportError: ++ return unittest.skip("test requires pyspark")(test_case) ++ else: ++ return test_case ++ ++ ++def require_joblibspark(test_case): ++ """ ++ Decorator marking a test that requires joblibspark. ++ ++ These tests are skipped when pyspark isn't installed. ++ ++ """ ++ try: ++ import joblibspark # noqa F401 ++ except ImportError: ++ return unittest.skip("test requires joblibspark")(test_case) ++ else: ++ return test_case ++ ++ ++def require_torchdata_stateful_dataloader(test_case): ++ """ ++ Decorator marking a test that requires torchdata.stateful_dataloader. ++ ++ These tests are skipped when torchdata with stateful_dataloader module isn't installed. ++ ++ """ ++ try: ++ import torchdata.stateful_dataloader # noqa F401 ++ except (ImportError, AssertionError): ++ return unittest.skip("test requires torchdata.stateful_dataloader")(test_case) ++ else: ++ return test_case ++ ++ ++def slow(test_case): ++ """ ++ Decorator marking a test as slow. ++ ++ Slow tests are skipped by default. Set the RUN_SLOW environment variable ++ to a truthy value to run them. ++ ++ """ ++ if not _run_slow_tests or _run_slow_tests == 0: ++ test_case = unittest.skip("test is slow")(test_case) ++ return test_case ++ ++ ++def local(test_case): ++ """ ++ Decorator marking a test as local ++ ++ Local tests are run by default. Set the RUN_LOCAL environment variable ++ to a falsy value to not run them. ++ """ ++ if not _run_local_tests or _run_local_tests == 0: ++ test_case = unittest.skip("test is local")(test_case) ++ return test_case ++ ++ ++def packaged(test_case): ++ """ ++ Decorator marking a test as packaged ++ ++ Packaged tests are run by default. Set the RUN_PACKAGED environment variable ++ to a falsy value to not run them. 
++ """ ++ if not _run_packaged_tests or _run_packaged_tests == 0: ++ test_case = unittest.skip("test is packaged")(test_case) ++ return test_case ++ ++ ++def remote(test_case): ++ """ ++ Decorator marking a test as one that relies on GitHub or the Hugging Face Hub. ++ ++ Remote tests are skipped by default. Set the RUN_REMOTE environment variable ++ to a falsy value to not run them. ++ """ ++ if not _run_remote_tests or _run_remote_tests == 0: ++ test_case = unittest.skip("test requires remote")(test_case) ++ return test_case ++ ++ ++def for_all_test_methods(*decorators): ++ def decorate(cls): ++ for name, fn in cls.__dict__.items(): ++ if callable(fn) and name.startswith("test"): ++ for decorator in decorators: ++ fn = decorator(fn) ++ setattr(cls, name, fn) ++ return cls ++ ++ return decorate ++ ++ ++class RequestWouldHangIndefinitelyError(Exception): ++ pass ++ ++ ++class OfflineSimulationMode(Enum): ++ CONNECTION_FAILS = 0 ++ CONNECTION_TIMES_OUT = 1 ++ HF_HUB_OFFLINE_SET_TO_1 = 2 ++ ++ ++@contextmanager ++def offline(mode: OfflineSimulationMode): ++ """ ++ Simulate offline mode. ++ ++ There are three offline simulation modes: ++ ++ CONNECTION_FAILS (default mode): a ConnectionError is raised for each network call. ++ CONNECTION_TIMES_OUT: a ReadTimeout or ConnectTimeout is raised for each network call. ++ HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE_SET_TO_1 environment variable is set to 1. ++ This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error. ++ ++ The raised exceptions are either from the `requests` library (if `huggingface_hub<1.0.0`) ++ or from the `httpx` library (if `huggingface_hub>=1.0.0`). 
++ """ ++ # Enable offline mode ++ if mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1: ++ with patch("datasets.config.HF_HUB_OFFLINE", True): ++ yield ++ return ++ ++ # Determine which exception to raise based on mode ++ ++ def error_response(*args, **kwargs): ++ if mode is OfflineSimulationMode.CONNECTION_FAILS: ++ exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError ++ elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT: ++ if kwargs.get("timeout") is None: ++ raise RequestWouldHangIndefinitelyError( ++ "Tried an HTTP call in offline mode with no timeout set. Please set a timeout." ++ ) ++ exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout ++ else: ++ raise ValueError("Please use a value from the OfflineSimulationMode enum.") ++ raise exc(f"Offline mode {mode}") ++ ++ # Patch all client methods to raise the appropriate error ++ client_mock = Mock() ++ for method in ["head", "get", "post", "put", "delete", "request", "stream"]: ++ setattr(client_mock, method, Mock(side_effect=error_response)) ++ ++ # Patching is slightly different depending on hfh internals ++ patch_target = ( ++ {"target": "huggingface_hub.utils._http._GLOBAL_CLIENT", "new": client_mock} ++ if IS_HF_HUB_1_x ++ else { ++ "target": "huggingface_hub.utils._http._get_session_from_cache", ++ "return_value": client_mock, ++ } ++ ) ++ with patch(**patch_target): ++ yield ++ ++ ++@contextmanager ++def set_current_working_directory_to_temp_dir(*args, **kwargs): ++ original_working_dir = str(Path().resolve()) ++ with tempfile.TemporaryDirectory(*args, **kwargs) as tmp_dir: ++ try: ++ os.chdir(tmp_dir) ++ yield ++ finally: ++ os.chdir(original_working_dir) ++ ++ ++@contextmanager ++def assert_arrow_memory_increases(): ++ import gc ++ ++ gc.collect() ++ previous_allocated_memory = pa.total_allocated_bytes() ++ yield ++ assert pa.total_allocated_bytes() - previous_allocated_memory > 0, "Arrow memory didn't increase." 
@contextmanager
def assert_arrow_memory_doesnt_increase():
    """Context manager asserting that the wrapped code does not grow Arrow's allocated bytes.

    The garbage collector is run before the baseline is taken; the comparison
    against ``pa.total_allocated_bytes()`` happens after the ``with`` body finishes.

    Raises:
        AssertionError: if the allocated byte count is higher after the body than before it.
    """
    import gc

    gc.collect()
    previous_allocated_memory = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - previous_allocated_memory <= 0, "Arrow memory wasn't expected to increase."


def is_rng_equal(rng1, rng2):
    """Return True when both random generators would produce the same next 10 integers in [0, 100).

    The draws are made on deep copies, so the state of ``rng1`` and ``rng2`` is not advanced.
    NOTE(review): presumably both arguments are ``numpy.random.Generator`` instances
    (anything exposing ``.integers(low, high, size)``) -- confirm against callers.
    """
    return deepcopy(rng1).integers(0, 100, 10).tolist() == deepcopy(rng2).integers(0, 100, 10).tolist()


def xfail_if_500_502_http_error(func):
    """Decorate ``func`` so that HTTP 500/502 errors are reported as xfail instead of failures.

    Any ``requests.HTTPError`` / ``httpx.HTTPError`` whose string form starts with
    "500" or "502" triggers ``pytest.xfail``; every other HTTP error is re-raised unchanged.
    """
    import decorator

    def _wrapper(func, *args, **kwargs):
        try:
            return func(*args, **kwargs)
        except (requests.HTTPError, httpx.HTTPError) as err:
            if str(err).startswith(("500", "502")):
                pytest.xfail(str(err))
            # Bare raise keeps the original traceback intact.
            raise

    return decorator.decorator(_wrapper, func)


# --- distributed testing functions --- #

# copied from transformers
# originally adapted from https://stackoverflow.com/a/59041913/9201239


class _RunOutput:
    """Result of a subprocess run: its return code plus captured stdout/stderr lines."""

    def __init__(self, returncode, stdout, stderr):
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr


async def _read_stream(stream, callback):
    """Feed every line read from ``stream`` to ``callback`` until an empty read (EOF)."""
    while True:
        line = await stream.readline()
        if line:
            callback(line)
        else:
            break


async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
    """Run ``cmd`` as a subprocess, capturing (and optionally echoing) its output line by line.

    Args:
        cmd: argument list; ``cmd[0]`` is the executable.
        env: environment passed to the child process.
        stdin: stdin handle passed to the child process.
        timeout: maximum number of seconds to wait on the two stream readers.
        quiet: when True, captured lines are not re-printed to this process' stdout/stderr.
        echo: when True, print the command line before starting it.

    Returns:
        A ``_RunOutput`` with the child's return code and the decoded stdout/stderr lines.
    """
    if echo:
        print("\nRunning: ", " ".join(cmd))

    p = await asyncio.create_subprocess_exec(
        cmd[0],
        *cmd[1:],
        stdin=stdin,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=env,
    )

    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
    #
    # If it starts hanging, will need to switch to the following code. The problem is that no data
    # will be seen until it's done and if it hangs for example there will be no debug info.
    # out, err = await p.communicate()
    # return _RunOutput(p.returncode, out, err)

    out = []
    err = []

    def tee(line, sink, pipe, label=""):
        line = line.decode("utf-8").rstrip()
        sink.append(line)
        if not quiet:
            print(label, line, file=pipe)

    # asyncio.wait() rejects bare coroutines since Python 3.11 (TypeError), so the
    # readers must be wrapped in tasks explicitly.
    readers = [
        asyncio.create_task(_read_stream(p.stdout, lambda line: tee(line, out, sys.stdout, label="stdout:"))),
        asyncio.create_task(_read_stream(p.stderr, lambda line: tee(line, err, sys.stderr, label="stderr:"))),
    ]

    # XXX: the timeout doesn't seem to make any difference here
    await asyncio.wait(readers, timeout=timeout)
    return _RunOutput(await p.wait(), out, err)


def _get_usable_event_loop():
    """Return the current event loop, creating and installing a new one if none is usable."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Python 3.14+ no longer creates a loop implicitly.
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop


def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
    """Synchronously run ``cmd`` through ``_stream_subprocess`` and validate the outcome.

    Args:
        cmd: argument list; ``cmd[0]`` is the executable.
        env: environment passed to the child process.
        stdin: stdin handle passed to the child process.
        timeout: seconds to wait on the output readers.
        quiet: suppress re-printing of the child's output.
        echo: print the command line before running it.

    Returns:
        The ``_RunOutput`` of the finished subprocess.

    Raises:
        RuntimeError: if the subprocess exits with a non-zero return code (including a
            negative one, i.e. terminated by a signal) or produces no output at all.
    """
    loop = _get_usable_event_loop()
    result = loop.run_until_complete(
        _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
    )

    cmd_str = " ".join(cmd)
    # A negative return code means the child was killed by a signal; that is a failure too.
    if result.returncode != 0:
        stderr = "\n".join(result.stderr)
        raise RuntimeError(
            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
            f"The combined stderr from workers follows:\n{stderr}"
        )

    # check that the subprocess actually did run and produced some output, should the test rely on
    # the remote side to do the testing
    if not result.stdout and not result.stderr:
        raise RuntimeError(f"'{cmd_str}' produced no output.")

    return result


def pytest_xdist_worker_id():
    """
    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0
    if `-n 1` or `pytest-xdist` isn't being used.
    """
    worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
    # Strip the "gw" prefix ("gw3" -> "3"); count=0 means "replace all matches".
    worker = re.sub(r"^gw", "", worker, count=0, flags=re.M)
    return int(worker)


def get_torch_dist_unique_port():
    """
    Returns a port number that can be fed to `torchrun`'s `--master_port` argument.

    Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same
    port at once.
    """
    port = 29500
    uniq_delta = pytest_xdist_worker_id()
    return port + uniq_delta