diff --git a/misc/py-datasets/Makefile b/misc/py-datasets/Makefile
index 774f71a63205..b9356b8a6908 100644
--- a/misc/py-datasets/Makefile
+++ b/misc/py-datasets/Makefile
@@ -1,70 +1,84 @@
 PORTNAME=	datasets
-DISTVERSION=	4.8.2
-PORTREVISION=	1
+DISTVERSION=	4.8.5
 CATEGORIES=	misc python # machine-learning
 MASTER_SITES=	PYPI
 PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
 
 MAINTAINER=	yuri@FreeBSD.org
 COMMENT=	HuggingFace community-driven open-source library of datasets
-WWW=		https://huggingface.co/docs/datasets/index
+WWW=		https://huggingface.co/docs/datasets/index \
+		https://github.com/huggingface/datasets
 
 LICENSE=	MIT
 LICENSE_FILE=	${WRKSRC}/LICENSE
 
 BUILD_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pyproject-hooks>0:devel/py-pyproject-hooks@${PY_FLAVOR} \
 		${PY_SETUPTOOLS} \
 		${PYTHON_PKGNAMEPREFIX}wheel>0:devel/py-wheel@${PY_FLAVOR}
 RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}dill>0.3.0:devel/py-dill@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}filelock>0:sysutils/py-filelock@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}fsspec>=2023.1.0:filesystems/py-fsspec@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}httpx>0:www/py-httpx@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}huggingface-hub>=0.25.0:misc/py-huggingface-hub@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}multiprocess>0:devel/py-multiprocess@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}numpy1>=1.16:math/py-numpy1@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}packaging>0:devel/py-packaging@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pandas>0:math/py-pandas@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pyarrow>=21.0.0:databases/py-pyarrow@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pyyaml>=5.1:devel/py-pyyaml@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR}
-RUN_DEPENDS_AUDIO= \
-		${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
+RUN_DEPENDS_AUDIO=	${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytorch>=2.8.0:misc/py-pytorch@${PY_FLAVOR}
-RUN_DEPENDS_VISION= \
-		${PY_PILLOW}
+RUN_DEPENDS_VISION=	${PY_PILLOW}
 RUN_DEPENDS+=	${RUN_DEPENDS_AUDIO} \
 		${RUN_DEPENDS_VISION}
 TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}elasticsearch>0:textproc/py-elasticsearch@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}faiss>=1.6.4:math/py-faiss@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}jax>=0.3.14:math/py-jax@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}jiwer>0:misc/py-jiwer@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}joblib>=1.3.0:devel/py-joblib@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}lz4>=0:archivers/py-lz4@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}polars>=0.20.0:misc/py-polars@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}protobuf>=4.0.0:devel/py-protobuf@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytest-datadir>=0:devel/py-pytest-datadir@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytest-xdist>=0:devel/py-pytest-xdist@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytest>=0:devel/py-pytest@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytorch>=2.0.0:misc/py-pytorch@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}rarfile>=4.0:archivers/py-rarfile@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}s3fs>=2021.11.1:filesystems/py-s3fs@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR}
-# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow
+# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow, transformers
 
 USES=		python
 USE_PYTHON=	pep517 concurrent autoplist pytest
 
-TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR} \
+		DATASETS_TEST_SKIP_TF=yes
+PYTEST_ARGS=	tests/
 
 NO_ARCH=	yes
 
-pre-test: # prevent failure due to missing pyspark
-	@${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py
+pre-patch: # make tests/ and tests/fixtures/ importable packages (conftest loads tests.fixtures.* plugins)
+	@${MKDIR} ${WRKSRC}/tests/fixtures && \
+		${TOUCH} ${WRKSRC}/tests/__init__.py ${WRKSRC}/tests/fixtures/__init__.py
+
+pre-test: # remove test modules that need dependencies or data that are not available
+	@${RM} ${WRKSRC}/tests/test_fingerprint_tokenizer_stability.py \
+		${WRKSRC}/tests/test_formatting.py \
+		${WRKSRC}/tests/test_load.py \
+		${WRKSRC}/tests/test_distributed.py \
+		${WRKSRC}/tests/test_search.py \
+		${WRKSRC}/tests/test_table.py \
+		${WRKSRC}/tests/test_builder.py \
+		${WRKSRC}/tests/test_parallel.py \
+		${WRKSRC}/tests/test_iterable_dataset.py \
+		${WRKSRC}/tests/test_upstream_hub.py \
+		${WRKSRC}/tests/test_fingerprint.py
 
 .include <bsd.port.mk>
diff --git a/misc/py-datasets/distinfo b/misc/py-datasets/distinfo
index 612ac838107b..19c878e8b494 100644
--- a/misc/py-datasets/distinfo
+++ b/misc/py-datasets/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1773758107
-SHA256 (datasets-4.8.2.tar.gz) = c6ad7e6c28c7436a9c6c23f817d1a450d395c771df881252dfe63697297cbcdf
-SIZE (datasets-4.8.2.tar.gz) = 603879
+TIMESTAMP = 1777403895
+SHA256 (datasets-4.8.5.tar.gz) = 0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772
+SIZE (datasets-4.8.5.tar.gz) = 605649
diff --git a/misc/py-datasets/files/patch-tests__test_patching.py b/misc/py-datasets/files/patch-tests__test_patching.py
new file mode 100644
index 000000000000..6beda41ed21b
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests__test_patching.py
@@ -0,0 +1,17 @@
+-- This patch adds tests/_test_patching.py, which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/_test_patching.py
+@@ -0,0 +1,11 @@
++# ruff: noqa: F401
++# This is the module that test_patching.py uses to test patch_submodule()
++import os
++import os as renamed_os
++from os import path
++from os import path as renamed_path
++from os.path import join
++from os.path import join as renamed_join
++
++
++open = open  # we just need to have a builtin inside this module to test it properly
diff --git a/misc/py-datasets/files/patch-tests_conftest.py b/misc/py-datasets/files/patch-tests_conftest.py
new file mode 100644
index 000000000000..248e9b692e63
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_conftest.py
@@ -0,0 +1,68 @@
+-- This patch adds tests/conftest.py, which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/conftest.py
+@@ -0,0 +1,62 @@
++import pytest
++
++import datasets
++import datasets.config
++
++
++# Import fixture modules as plugins
++pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"]
++
++
++def pytest_collection_modifyitems(config, items):
++    # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit")
++    for item in items:
++        if any(marker in item.keywords for marker in ["integration", "unit"]):
++            continue
++        item.add_marker(pytest.mark.unit)
++
++
++@pytest.fixture(autouse=True)
++def set_test_cache_config(tmp_path_factory, monkeypatch):
++    # test_hf_cache_home = tmp_path_factory.mktemp("cache")  # TODO: why a cache dir per test function does not work?
++    test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache"
++    test_hf_datasets_cache = test_hf_cache_home / "datasets"
++    monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache))
++    test_downloaded_datasets_path = test_hf_datasets_cache / "downloads"
++    monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path))
++    test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
++    monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))
++
++    # used in dataset viewer, we may set it to true by default in the future
++    monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)
++
++
++@pytest.fixture(autouse=True)
++def disable_implicit_token(monkeypatch):
++    monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True)
++
++
++@pytest.fixture(autouse=True, scope="session")
++def disable_tqdm_output():
++    datasets.disable_progress_bar()
++
++
++@pytest.fixture(autouse=True)
++def set_update_download_counts_to_false(monkeypatch):
++    # don't take tests into account when counting downloads
++    monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False)
++
++
++@pytest.fixture
++def set_sqlalchemy_silence_uber_warning(monkeypatch):
++    # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0
++    # To be removed once SQLAlchemy 2.0 supported
++    try:
++        monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True)
++    except (ModuleNotFoundError, AttributeError):
++        pass
++
++
++@pytest.fixture(autouse=True, scope="session")
++def zero_time_out_for_remote_code():
++    datasets.config.TIME_OUT_REMOTE_CODE = 0
diff --git a/misc/py-datasets/files/patch-tests_fixtures_files.py b/misc/py-datasets/files/patch-tests_fixtures_files.py
new file mode 100644
index 000000000000..7053267f2eaa
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_files.py
@@ -0,0 +1,636 @@
+-- This patch adds tests/fixtures/files.py, which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/files.py
+@@ -0,0 +1,630 @@
++import contextlib
++import csv
++import json
++import os
++import sqlite3
++import tarfile
++import textwrap
++import zipfile
++
++import pandas as pd
++import pyarrow as pa
++import pyarrow.parquet as pq
++import pytest
++
++import datasets
++import datasets.config
++
++
++# dataset + arrow_file
++
++
++@pytest.fixture(scope="session")
++def dataset():
++    n = 10
++    features = datasets.Features(
++        {
++            "tokens": datasets.List(datasets.Value("string")),
++            "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])),
++            "answers": {
++                "text": datasets.List(datasets.Value("string")),
++                "answer_start": datasets.List(datasets.Value("int32")),
++            },
++            "id": datasets.Value("int64"),
++        }
++    )
++    dataset = datasets.Dataset.from_dict(
++        {
++            "tokens": [["foo"] * 5] * n,
++            "labels": [[1] * 5] * n,
++            "answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
++            "id": list(range(n)),
++        },
++        features=features,
++    )
++    return dataset
++
++
++@pytest.fixture(scope="session")
++def arrow_file(tmp_path_factory, dataset):
++    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
++    dataset.map(cache_file_name=filename)
++    return filename
++
++
++# FILE_CONTENT + files
++
++
++FILE_CONTENT = """\
++    Text data.
++    Second line of data."""
++
++
++@pytest.fixture(scope="session")
++def text_file_content():
++    return FILE_CONTENT
++
++
++@pytest.fixture(scope="session")
++def text_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.txt"
++    data = FILE_CONTENT
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++@pytest.fixture(scope="session")
++def bz2_file(tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def gz_file(tmp_path_factory):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
++    data = bytes(FILE_CONTENT, "utf-8")
++    with gzip.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def lz4_file(tmp_path_factory):
++    if datasets.config.LZ4_AVAILABLE:
++        import lz4.frame
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with lz4.frame.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++@pytest.fixture(scope="session")
++def seven_zip_file(tmp_path_factory, text_file):
++    if datasets.config.PY7ZR_AVAILABLE:
++        import py7zr
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.7z"
++        with py7zr.SevenZipFile(path, "w") as archive:
++            archive.write(text_file, arcname=os.path.basename(text_file))
++        return path
++
++
++@pytest.fixture(scope="session")
++def tar_file(tmp_path_factory, text_file):
++    import tarfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def xz_file(tmp_path_factory):
++    import lzma
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with lzma.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_file(tmp_path_factory, text_file):
++    import zipfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zstd_file(tmp_path_factory):
++    if datasets.config.ZSTANDARD_AVAILABLE:
++        import zstandard as zstd
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with zstd.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++# xml_file
++
++
++@pytest.fixture(scope="session")
++def xml_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.xml"
++    data = textwrap.dedent(
++        """\
++    <?xml version="1.0" encoding="UTF-8" ?>
++    <tmx version="1.4">
++      <header segtype="sentence" srclang="ca" />
++      <body>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 1</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 2</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 3</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 4</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 5</seg></tuv>
++        </tu>
++      </body>
++    </tmx>"""
++    )
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++DATA = [
++    {"col_1": "0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "3", "col_2": 3, "col_3": 3.0},
++]
++DATA2 = [
++    {"col_1": "4", "col_2": 4, "col_3": 4.0},
++    {"col_1": "5", "col_2": 5, "col_3": 5.0},
++]
++DATA_DICT_OF_LISTS = {
++    "col_1": ["0", "1", "2", "3"],
++    "col_2": [0, 1, 2, 3],
++    "col_3": [0.0, 1.0, 2.0, 3.0],
++}
++
++DATA_312 = [
++    {"col_3": 0.0, "col_1": "0", "col_2": 0},
++    {"col_3": 1.0, "col_1": "1", "col_2": 1},
++]
++
++DATA_STR = [
++    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
++]
++
++DATA_MISSING_FIELDS = [
++    {"col_1": 1, "col_2": 2},
++    {"col_1": 1, "col_3": 3},
++]
++
++DATA_MIXED_TYPES = [
++    {"col_1": 1, "col_2": {"a": "a"}, "col_3": [{"x": "x"}]},
++    {"col_1": "one", "col_2": {"b": "b"}, "col_3": [{"y": "y"}]},
++    {"col_1": None, "col_2": None, "col_3": [None]},
++]
++
++
++@pytest.fixture(scope="session")
++def dataset_dict():
++    return DATA_DICT_OF_LISTS
++
++
++@pytest.fixture(scope="session")
++def arrow_path(tmp_path_factory):
++    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)
++    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
++    dataset.map(cache_file_name=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def sqlite_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite")
++    with contextlib.closing(sqlite3.connect(path)) as con:
++        cur = con.cursor()
++        cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)")
++        for item in DATA:
++            cur.execute("INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)", tuple(item.values()))
++        con.commit()
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def bz2_csv_path(csv_path, tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
++    with open(csv_path, "rb") as f:
++        data = f.read()
++    # data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path.replace(".csv", ".CSV")))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(".csv", ".CSV")))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
++        f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def parquet_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
++    schema = pa.schema(
++        {
++            "col_1": pa.string(),
++            "col_2": pa.int64(),
++            "col_3": pa.float64(),
++        }
++    )
++    with open(path, "wb") as f:
++        writer = pq.ParquetWriter(f, schema=schema)
++        pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)
++        writer.write_table(pa_table)
++        writer.close()
++    return path
++
++
++@pytest.fixture(scope="session")
++def geoparquet_path(tmp_path_factory):
++    df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet")
++    path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
++    df.to_parquet(path=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_list_of_dicts_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_dict_of_lists_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA_DICT_OF_LISTS}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_312_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_312:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_str_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_STR:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_missing_fields_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-missing-fields.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MISSING_FIELDS:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_mixed_types_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-mixed-types.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MIXED_TYPES:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_gz_path(tmp_path_factory, text_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
++    with open(text_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_gz_path(tmp_path_factory, jsonl_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
++    with open(jsonl_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
++        f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text2_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_dir(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path.parent
++
++
++@pytest.fixture(scope="session")
++def text_dir_with_unsupported_extension(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data") / "dataset.abc"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename(text_path))
++        f.write(text2_path, arcname=os.path.basename(text2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
++        f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.ext.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename("unsupported.ext"))
++        f.write(text2_path, arcname=os.path.basename("unsupported_2.ext"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path_with_unicode_new_lines(tmp_path_factory):
++    text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
++    path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
++    with open(path, "w", encoding="utf-8") as f:
++        f.write(text)
++    return path
++
++
++@pytest.fixture(scope="session")
++def image_file():
++    return os.path.join("tests", "features", "data", "test_image_rgb.jpg")
++
++
++@pytest.fixture(scope="session")
++def audio_file():
++    return os.path.join("tests", "features", "data", "test_audio_44100.wav")
++
++
++@pytest.fixture(scope="session")
++def audio_file_44100():
++    return os.path.join("tests", "features", "data", "test_audio_44100.mp3")
++
++
++@pytest.fixture(scope="session")
++def audio_file_16000():
++    return os.path.join("tests", "features", "data", "test_audio_16000.mp3")
++
++
++@pytest.fixture(scope="session")
++def tensor_file(tmp_path_factory):
++    import torch
++
++    path = tmp_path_factory.mktemp("data") / "tensor.pth"
++    with open(path, "wb") as f:
++        torch.save(torch.ones(128), f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_image_path(image_file, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.img.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(image_file, arcname=os.path.basename(image_file))
++        f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def data_dir_with_hidden_files(tmp_path_factory):
++    data_dir = tmp_path_factory.mktemp("data_dir")
++
++    (data_dir / "subdir").mkdir()
++    with open(data_dir / "subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / "subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++    # hidden file
++    with open(data_dir / "subdir" / ".test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    # hidden directory
++    (data_dir / ".subdir").mkdir()
++    with open(data_dir / ".subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / ".subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    return data_dir
diff --git a/misc/py-datasets/files/patch-tests_fixtures_fsspec.py b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
new file mode 100644
index 000000000000..311541e7a5dd
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
@@ -0,0 +1,119 @@
+-- This patch adds tests/fixtures/fsspec.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/fsspec.py
+@@ -0,0 +1,113 @@
++import posixpath
++from pathlib import Path
++from unittest.mock import patch
++
++import pytest
++from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem, stringify_path
++from fsspec.registry import _registry as _fsspec_registry
++
++
++class MockFileSystem(AbstractFileSystem):
++    protocol = "mock"
++
++    def __init__(self, *args, local_root_dir, **kwargs):
++        super().__init__()
++        self._fs = LocalFileSystem(*args, **kwargs)
++        self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/"
++
++    def mkdir(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.mkdir(path, *args, **kwargs)
++
++    def makedirs(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.makedirs(path, *args, **kwargs)
++
++    def rmdir(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rmdir(path)
++
++    def ls(self, path, detail=True, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = self._fs.ls(path, detail=detail, *args, **kwargs)
++        if detail:
++            return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out]
++        else:
++            return [name[len(self.local_root_dir) :] for name in out]
++
++    def info(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = dict(self._fs.info(path, *args, **kwargs))
++        out["name"] = out["name"][len(self.local_root_dir) :]
++        return out
++
++    def cp_file(self, path1, path2, *args, **kwargs):
++        path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1))
++        path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2))
++        return self._fs.cp_file(path1, path2, *args, **kwargs)
++
++    def rm_file(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm_file(path, *args, **kwargs)
++
++    def rm(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm(path, *args, **kwargs)
++
++    def _open(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs._open(path, *args, **kwargs)
++
++    def created(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.created(path)
++
++    def modified(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.modified(path)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("mock://"):
++            path = path[7:]
++        return path
++
++
++class TmpDirFileSystem(MockFileSystem):
++    protocol = "tmp"
++    tmp_dir = None
++
++    def __init__(self, *args, **kwargs):
++        assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set"
++        super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("tmp://"):
++            path = path[6:]
++        return path
++
++
++@pytest.fixture
++def mock_fsspec():
++    _fsspec_registry["mock"] = MockFileSystem
++    _fsspec_registry["tmp"] = TmpDirFileSystem
++    yield
++    del _fsspec_registry["mock"]
++    del _fsspec_registry["tmp"]
++
++
++@pytest.fixture
++def mockfs(tmp_path_factory, mock_fsspec):
++    local_fs_dir = tmp_path_factory.mktemp("mockfs")
++    return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True)
++
++
++@pytest.fixture
++def tmpfs(tmp_path_factory, mock_fsspec):
++    tmp_fs_dir = tmp_path_factory.mktemp("tmpfs")
++    with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir):
++        yield TmpDirFileSystem()
++        TmpDirFileSystem.clear_instance_cache()
diff --git a/misc/py-datasets/files/patch-tests_fixtures_hub.py b/misc/py-datasets/files/patch-tests_fixtures_hub.py
new file mode 100644
index 000000000000..771dd0d56344
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_hub.py
@@ -0,0 +1,235 @@
+-- This patch adds tests/fixtures/hub.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/hub.py
+@@ -0,0 +1,229 @@
++import os
++import time
++import uuid
++from contextlib import contextmanager
++from typing import Optional
++
++import pytest
++from huggingface_hub.hf_api import HfApi
++from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
++from huggingface_hub.utils._headers import _http_user_agent
++from packaging import version
++
++from datasets import config
++
++
++if config.HF_HUB_VERSION >= version.parse("1.6.0"):
++    from huggingface_hub.errors import BucketNotFoundError
++
++else:
++    BucketNotFoundError = None
++
++CI_HUB_USER = "__DUMMY_DATASETS_USER__"
++CI_HUB_USER_FULL_NAME = "Dummy User"
++CI_HUB_USER_TOKEN = "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"
++
++CI_HUB_ENDPOINT = "https://hub-ci.huggingface.co"
++CI_HUB_DATASETS_URL = CI_HUB_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
++CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE = CI_HUB_ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}"
++
++
++@pytest.fixture
++def ci_hub_config(monkeypatch):
++    monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT)
++    monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL)
++    monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE)
++    try:
++        # for backward compatibility with huggingface_hub 0.x
++        monkeypatch.setattr(
++            "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE
++        )
++    except AttributeError:
++        pass
++    old_environ = dict(os.environ)
++    os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT
++    yield
++    os.environ.clear()
++    os.environ.update(old_environ)
++
++
++@pytest.fixture
++def set_ci_hub_access_token(ci_hub_config, monkeypatch):
++    # Enable implicit token
++    monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", False)
++    old_environ = dict(os.environ)
++    os.environ["HF_TOKEN"] = CI_HUB_USER_TOKEN
++    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "0"
++    yield
++    os.environ.clear()
++    os.environ.update(old_environ)
++
++
++def _http_ci_user_agent(*args, **kwargs):
++    ua = _http_user_agent(*args, **kwargs)
++    return ua + os.environ.get("CI_HEADERS", "")
++
++
++@pytest.fixture(autouse=True)
++def set_hf_ci_headers(monkeypatch):
++    old_environ = dict(os.environ)
++    os.environ["TRANSFORMERS_IS_CI"] = "1"
++    monkeypatch.setattr("huggingface_hub.utils._headers._http_user_agent", _http_ci_user_agent)
++    yield
++    os.environ.clear()
++    os.environ.update(old_environ)
++
++
++@pytest.fixture(scope="session")
++def hf_api():
++    return HfApi(endpoint=CI_HUB_ENDPOINT)
++
++
++@pytest.fixture(scope="session")
++def hf_token():
++    yield CI_HUB_USER_TOKEN
++
++
++@pytest.fixture
++def cleanup_repo(hf_api: HfApi):
++    def _cleanup_repo(repo_id):
++        hf_api.delete_repo(repo_id, token=CI_HUB_USER_TOKEN, repo_type="dataset")
++
++    return _cleanup_repo
++
++
++@pytest.fixture
++def cleanup_bucket(hf_api: HfApi):
++    def _cleanup_bucket(bucket_id):
++        hf_api.delete_bucket(bucket_id, token=CI_HUB_USER_TOKEN)
++
++    return _cleanup_bucket
++
++
++@pytest.fixture
++def temporary_repo(cleanup_repo):
++    @contextmanager
++    def _temporary_repo(repo_id: Optional[str] = None):
++        repo_id = repo_id or f"{CI_HUB_USER}/test-dataset-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}"
++        try:
++            yield repo_id
++        finally:
++            try:
++                cleanup_repo(repo_id)
++            except RepositoryNotFoundError:
++                pass
++
++    return _temporary_repo
++
++
++@pytest.fixture
++def temporary_bucket(cleanup_bucket):
++    @contextmanager
++    def _temporary_bucket(bucket_id: Optional[str] = None):
++        bucket_id = bucket_id or f"{CI_HUB_USER}/test-bucket-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}"
++        try:
++            yield bucket_id
++        finally:
++            try:
++                cleanup_bucket(bucket_id)
++            except BucketNotFoundError:
++                pass
++
++    return _temporary_bucket
++
++
++@pytest.fixture(scope="session")
++def _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content):
++    repo_name = f"repo_txt_data-{int(time.time() * 10e6)}"
++    repo_id = f"{CI_HUB_USER}/{repo_name}"
++    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset")
++    hf_api.upload_file(
++        token=hf_token,
++        path_or_fileobj=text_file_content.encode(),
++        path_in_repo="data/text_data.txt",
++        repo_id=repo_id,
++        repo_type="dataset",
++    )
++    hf_api.update_repo_settings(repo_id, token=hf_token, repo_type="dataset", gated="auto")
++    yield repo_id
++    try:
++        hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
++    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
++        pass
++
++
++@pytest.fixture()
++def hf_gated_dataset_repo_txt_data(_hf_gated_dataset_repo_txt_data, ci_hub_config):
++    return _hf_gated_dataset_repo_txt_data
++
++
++@pytest.fixture(scope="session")
++def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content):
++    repo_name = f"repo_txt_data-{int(time.time() * 10e6)}"
++    repo_id = f"{CI_HUB_USER}/{repo_name}"
++    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
++    hf_api.upload_file(
++        token=hf_token,
++        path_or_fileobj=text_file_content.encode(),
++        path_in_repo="data/text_data.txt",
++        repo_id=repo_id,
++        repo_type="dataset",
++    )
++    yield repo_id
++    try:
++        hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
++    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
++        pass
++
++
++@pytest.fixture()
++def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_config):
++    return hf_private_dataset_repo_txt_data_
++
++
++@pytest.fixture(scope="session")
++def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path):
++    repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e6)}"
++    repo_id = f"{CI_HUB_USER}/{repo_name}"
++    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
++    hf_api.upload_file(
++        token=hf_token,
++        path_or_fileobj=str(zip_csv_with_dir_path),
++        path_in_repo="data.zip",
++        repo_id=repo_id,
++        repo_type="dataset",
++    )
++    yield repo_id
++    try:
++        hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
++    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
++        pass
++
++
++@pytest.fixture()
++def hf_private_dataset_repo_zipped_txt_data(hf_private_dataset_repo_zipped_txt_data_, ci_hub_config):
++    return hf_private_dataset_repo_zipped_txt_data_
++
++
++@pytest.fixture(scope="session")
++def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path):
++    repo_name = f"repo_zipped_img_data-{int(time.time() * 10e6)}"
++    repo_id = f"{CI_HUB_USER}/{repo_name}"
++    hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
++    hf_api.upload_file(
++        token=hf_token,
++        path_or_fileobj=str(zip_image_path),
++        path_in_repo="data.zip",
++        repo_id=repo_id,
++        repo_type="dataset",
++    )
++    yield repo_id
++    try:
++        hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
++    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
++        pass
++
++
++@pytest.fixture()
++def hf_private_dataset_repo_zipped_img_data(hf_private_dataset_repo_zipped_img_data_, ci_hub_config):
++    return hf_private_dataset_repo_zipped_img_data_
diff --git a/misc/py-datasets/files/patch-tests_utils.py b/misc/py-datasets/files/patch-tests_utils.py
new file mode 100644
index 000000000000..fa46c80d083f
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_utils.py
@@ -0,0 +1,626 @@
+-- This patch adds tests/utils.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag (4.8.5).
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/utils.py
+@@ -0,0 +1,620 @@
++import asyncio
++import importlib.metadata
++import os
++import re
++import sys
++import tempfile
++import unittest
++from contextlib import contextmanager
++from copy import deepcopy
++from distutils.util import strtobool
++from enum import Enum
++from importlib.util import find_spec
++from pathlib import Path
++from unittest.mock import Mock, patch
++
++import httpx
++import pyarrow as pa
++import pytest
++import requests
++from packaging import version
++
++from datasets import config
++
++
++def parse_flag_from_env(key, default=False):
++    try:
++        value = os.environ[key]
++    except KeyError:
++        # KEY isn't set, default to `default`.
++        _value = default
++    else:
++        # KEY is set, convert it to True or False.
++        try:
++            _value = strtobool(value)
++        except ValueError:
++            # More values are supported, but let's keep the message simple.
++            raise ValueError(f"If set, {key} must be yes or no.")
++    return _value
++
++
++_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
++_run_remote_tests = parse_flag_from_env("RUN_REMOTE", default=False)
++_run_local_tests = parse_flag_from_env("RUN_LOCAL", default=True)
++_run_packaged_tests = parse_flag_from_env("RUN_PACKAGED", default=True)
++
++# Compression
++require_lz4 = pytest.mark.skipif(not config.LZ4_AVAILABLE, reason="test requires lz4")
++require_py7zr = pytest.mark.skipif(not config.PY7ZR_AVAILABLE, reason="test requires py7zr")
++require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard")
++
++# Dill-cloudpickle compatibility
++require_dill_gt_0_3_2 = pytest.mark.skipif(
++    config.DILL_VERSION <= version.parse("0.3.2"),
++    reason="test requires dill>0.3.2 for cloudpickle compatibility",
++)
++
++# Windows
++require_not_windows = pytest.mark.skipif(
++    sys.platform == "win32",
++    reason="test should not be run on Windows",
++)
++
++
++require_faiss = pytest.mark.skipif(find_spec("faiss") is None or sys.platform == "win32", reason="test requires faiss")
++require_moto = pytest.mark.skipif(find_spec("moto") is None, reason="test requires moto")
++require_numpy1_on_windows = pytest.mark.skipif(
++    version.parse(importlib.metadata.version("numpy")) >= version.parse("2.0.0") and sys.platform == "win32",
++    reason="test requires numpy < 2.0 on windows",
++)
++
++IS_HF_HUB_1_x = config.HF_HUB_VERSION >= version.parse("0.99")  # clunky but works with pre-releases
++
++
++def require_buckets_support_in_huggingface_hub(test_case):
++    """
++    Decorator marking a test that requires buckets support in huggingface_hub.
++
++    These tests are skipped when huggingface_hub's version doesn't support buckets.
++
++    """
++    try:
++        from huggingface_hub.utils import BucketNotFoundError  # noqa
++    except ImportError:
++        test_case = unittest.skip("test requires buckets support in huggingface_hub")(test_case)
++    return test_case
++
++
++def require_regex(test_case):
++    """
++    Decorator marking a test that requires regex.
++
++    These tests are skipped when Regex isn't installed.
++
++    """
++    try:
++        import regex  # noqa
++    except ImportError:
++        test_case = unittest.skip("test requires regex")(test_case)
++    return test_case
++
++
++def require_elasticsearch(test_case):
++    """
++    Decorator marking a test that requires ElasticSearch.
++
++    These tests are skipped when ElasticSearch isn't installed.
++
++    """
++    try:
++        import elasticsearch  # noqa
++    except ImportError:
++        test_case = unittest.skip("test requires elasticsearch")(test_case)
++    return test_case
++
++
++def require_sqlalchemy(test_case):
++    """
++    Decorator marking a test that requires SQLAlchemy.
++
++    These tests are skipped when SQLAlchemy isn't installed.
++
++    """
++    try:
++        import sqlalchemy  # noqa
++    except ImportError:
++        test_case = unittest.skip("test requires sqlalchemy")(test_case)
++    return test_case
++
++
++def require_torch(test_case):
++    """
++    Decorator marking a test that requires PyTorch.
++
++    These tests are skipped when PyTorch isn't installed.
++
++    """
++    if not config.TORCH_AVAILABLE:
++        test_case = unittest.skip("test requires PyTorch")(test_case)
++    return test_case
++
++
++def require_torch_compile(test_case):
++    """
++    Decorator marking a test that requires PyTorch with `torch.compile` support.
++
++    These tests are skipped when PyTorch isn't installed or when running on Python >= 3.14.
++
++    """
++    if not config.TORCH_AVAILABLE:
++        test_case = unittest.skip("test requires PyTorch")(test_case)
++    if config.PY_VERSION >= version.parse("3.14"):
++        test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case)
++    return test_case
++
++
++def require_polars(test_case):
++    """
++    Decorator marking a test that requires Polars.
++
++    These tests are skipped when Polars isn't installed.
++
++    """
++    if not config.POLARS_AVAILABLE:
++        test_case = unittest.skip("test requires Polars")(test_case)
++    return test_case
++
++
++def require_tf(test_case):
++    """
++    Decorator marking a test that requires TensorFlow.
++
++    These tests are skipped when TensorFlow isn't installed.
++
++    """
++    if not config.TF_AVAILABLE or os.environ.get("DATASETS_TEST_SKIP_TF"):
++        test_case = unittest.skip("test requires TensorFlow")(test_case)
++    return test_case
++
++
++def require_jax(test_case):
++    """
++    Decorator marking a test that requires JAX.
++
++    These tests are skipped when JAX isn't installed.
++
++    """
++    if not config.JAX_AVAILABLE:
++        test_case = unittest.skip("test requires JAX")(test_case)
++    return test_case
++
++
++def require_pil(test_case):
++    """
++    Decorator marking a test that requires Pillow.
++
++    These tests are skipped when Pillow isn't installed.
++
++    """
++    if not config.PIL_AVAILABLE:
++        test_case = unittest.skip("test requires Pillow")(test_case)
++    return test_case
++
++
++def require_torchvision(test_case):
++    """
++    Decorator marking a test that requires torchvision.
++
++    These tests are skipped when torchvision isn't installed.
++
++    """
++    if not config.TORCHVISION_AVAILABLE:
++        test_case = unittest.skip("test requires torchvision")(test_case)
++    return test_case
++
++
++def require_torchcodec(test_case):
++    """
++    Decorator marking a test that requires torchcodec.
++
++    These tests are skipped when torchcodec isn't installed.
++
++    """
++    if not config.TORCHCODEC_AVAILABLE:
++        test_case = unittest.skip("test requires torchcodec")(test_case)
++    return test_case
++
++
++def require_pdfplumber(test_case):
++    """
++    Decorator marking a test that requires pdfplumber.
++
++    These tests are skipped when pdfplumber isn't installed.
++
++    """
++    if not config.PDFPLUMBER_AVAILABLE:
++        test_case = unittest.skip("test requires pdfplumber")(test_case)
++    return test_case
++
++
++def require_nibabel(test_case):
++    """
++    Decorator marking a test that requires nibabel.
++
++    These tests are skipped when nibabel isn't installed.
++
++    """
++    if not config.NIBABEL_AVAILABLE:
++        test_case = unittest.skip("test requires nibabel")(test_case)
++    return test_case
++
++
++def require_transformers(test_case):
++    """
++    Decorator marking a test that requires transformers.
++
++    These tests are skipped when transformers isn't installed.
++
++    """
++    try:
++        import transformers  # noqa F401
++    except ImportError:
++        return unittest.skip("test requires transformers")(test_case)
++    else:
++        return test_case
++
++
++def require_tiktoken(test_case):
++    """
++    Decorator marking a test that requires tiktoken.
++
++    These tests are skipped when tiktoken isn't installed.
++
++    """
++    try:
++        import tiktoken  # noqa F401
++    except ImportError:
++        return unittest.skip("test requires tiktoken")(test_case)
++    else:
++        return test_case
++
++
++def require_spacy(test_case):
++    """
++    Decorator marking a test that requires spacy.
++
++    These tests are skipped when spacy isn't installed.
++
++    """
++    try:
++        import spacy  # noqa F401
++    except ImportError:
++        return unittest.skip("test requires spacy")(test_case)
++    else:
++        return test_case
++
++
++def require_pyspark(test_case):
++    """
++    Decorator marking a test that requires pyspark.
++
++    These tests are skipped when pyspark isn't installed.
++
++    """
++    try:
++        import pyspark  # noqa F401
++    except ImportError:
++        return unittest.skip("test requires pyspark")(test_case)
++    else:
++        return test_case
++
++
++def require_joblibspark(test_case):
++    """
++    Decorator marking a test that requires joblibspark.
++
++    These tests are skipped when joblibspark isn't installed.
++
++    """
++    try:
++        import joblibspark  # noqa F401
++    except ImportError:
++        return unittest.skip("test requires joblibspark")(test_case)
++    else:
++        return test_case
++
++
++def require_torchdata_stateful_dataloader(test_case):
++    """
++    Decorator marking a test that requires torchdata.stateful_dataloader.
++
++    These tests are skipped when torchdata with stateful_dataloader module isn't installed.
++
++    """
++    try:
++        import torchdata.stateful_dataloader  # noqa F401
++    except (ImportError, AssertionError):
++        return unittest.skip("test requires torchdata.stateful_dataloader")(test_case)
++    else:
++        return test_case
++
++
++def slow(test_case):
++    """
++    Decorator marking a test as slow.
++
++    Slow tests are skipped by default. Set the RUN_SLOW environment variable
++    to a truthy value to run them.
++
++    """
++    if not _run_slow_tests or _run_slow_tests == 0:
++        test_case = unittest.skip("test is slow")(test_case)
++    return test_case
++
++
++def local(test_case):
++    """
++    Decorator marking a test as local
++
++    Local tests are run by default. Set the RUN_LOCAL environment variable
++    to a falsy value to not run them.
++    """
++    if not _run_local_tests or _run_local_tests == 0:
++        test_case = unittest.skip("test is local")(test_case)
++    return test_case
++
++
++def packaged(test_case):
++    """
++    Decorator marking a test as packaged
++
++    Packaged tests are run by default. Set the RUN_PACKAGED environment variable
++    to a falsy value to not run them.
++    """
++    if not _run_packaged_tests or _run_packaged_tests == 0:
++        test_case = unittest.skip("test is packaged")(test_case)
++    return test_case
++
++
++def remote(test_case):
++    """
++    Decorator marking a test as one that relies on GitHub or the Hugging Face Hub.
++
++    Remote tests are skipped by default. Set the RUN_REMOTE environment variable
++    to a truthy value to run them.
++    """
++    if not _run_remote_tests or _run_remote_tests == 0:
++        test_case = unittest.skip("test requires remote")(test_case)
++    return test_case
++
++
++def for_all_test_methods(*decorators):
++    def decorate(cls):
++        for name, fn in cls.__dict__.items():
++            if callable(fn) and name.startswith("test"):
++                for decorator in decorators:
++                    fn = decorator(fn)
++                setattr(cls, name, fn)
++        return cls
++
++    return decorate
++
++
++class RequestWouldHangIndefinitelyError(Exception):
++    pass
++
++
++class OfflineSimulationMode(Enum):
++    CONNECTION_FAILS = 0
++    CONNECTION_TIMES_OUT = 1
++    HF_HUB_OFFLINE_SET_TO_1 = 2
++
++
++@contextmanager
++def offline(mode: OfflineSimulationMode):
++    """
++    Simulate offline mode.
++
++    There are three offline simulation modes:
++
++    CONNECTION_FAILS: a ConnectError or ConnectionError is raised for each network call.
++    CONNECTION_TIMES_OUT: a ReadTimeout or ConnectTimeout is raised for each network call (a timeout must be set).
++    HF_HUB_OFFLINE_SET_TO_1: `datasets.config.HF_HUB_OFFLINE` is patched to True, as if HF_HUB_OFFLINE was set to 1.
++        This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error.
++
++    The raised exceptions are either from the `requests` library (if `huggingface_hub<1.0.0`)
++    or from the `httpx` library (if `huggingface_hub>=1.0.0`).
++    """
++    # Enable offline mode
++    if mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1:
++        with patch("datasets.config.HF_HUB_OFFLINE", True):
++            yield
++        return
++
++    # Determine which exception to raise based on mode
++
++    def error_response(*args, **kwargs):
++        if mode is OfflineSimulationMode.CONNECTION_FAILS:
++            exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError
++        elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT:
++            if kwargs.get("timeout") is None:
++                raise RequestWouldHangIndefinitelyError(
++                    "Tried an HTTP call in offline mode with no timeout set. Please set a timeout."
++                )
++            exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout
++        else:
++            raise ValueError("Please use a value from the OfflineSimulationMode enum.")
++        raise exc(f"Offline mode {mode}")
++
++    # Patch all client methods to raise the appropriate error
++    client_mock = Mock()
++    for method in ["head", "get", "post", "put", "delete", "request", "stream"]:
++        setattr(client_mock, method, Mock(side_effect=error_response))
++
++    # Patching is slightly different depending on hfh internals
++    patch_target = (
++        {"target": "huggingface_hub.utils._http._GLOBAL_CLIENT", "new": client_mock}
++        if IS_HF_HUB_1_x
++        else {
++            "target": "huggingface_hub.utils._http._get_session_from_cache",
++            "return_value": client_mock,
++        }
++    )
++    with patch(**patch_target):
++        yield
++
++
++@contextmanager
++def set_current_working_directory_to_temp_dir(*args, **kwargs):
++    original_working_dir = str(Path().resolve())
++    with tempfile.TemporaryDirectory(*args, **kwargs) as tmp_dir:
++        try:
++            os.chdir(tmp_dir)
++            yield
++        finally:
++            os.chdir(original_working_dir)
++
++
++@contextmanager
++def assert_arrow_memory_increases():
++    import gc
++
++    gc.collect()
++    previous_allocated_memory = pa.total_allocated_bytes()
++    yield
++    assert pa.total_allocated_bytes() - previous_allocated_memory > 0, "Arrow memory didn't increase."
++
++
++@contextmanager
++def assert_arrow_memory_doesnt_increase():
++    import gc
++
++    gc.collect()
++    previous_allocated_memory = pa.total_allocated_bytes()
++    yield
++    assert pa.total_allocated_bytes() - previous_allocated_memory <= 0, "Arrow memory wasn't expected to increase."
++
++
++def is_rng_equal(rng1, rng2):
++    return deepcopy(rng1).integers(0, 100, 10).tolist() == deepcopy(rng2).integers(0, 100, 10).tolist()
++
++
++def xfail_if_500_502_http_error(func):
++    import decorator
++
++    def _wrapper(func, *args, **kwargs):
++        try:
++            return func(*args, **kwargs)
++        except (requests.HTTPError, httpx.HTTPError) as err:
++            if str(err).startswith("500") or str(err).startswith("502"):
++                pytest.xfail(str(err))
++            raise err
++
++    return decorator.decorator(_wrapper, func)
++
++
++# --- distributed testing functions --- #
++
++# copied from transformers
++# originally adapted from https://stackoverflow.com/a/59041913/9201239
++
++
++class _RunOutput:
++    def __init__(self, returncode, stdout, stderr):
++        self.returncode = returncode
++        self.stdout = stdout
++        self.stderr = stderr
++
++
++async def _read_stream(stream, callback):
++    while True:
++        line = await stream.readline()
++        if line:
++            callback(line)
++        else:
++            break
++
++
++async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
++    """Run `cmd` as a subprocess, tee its stdout/stderr line by line and return them in a `_RunOutput`."""
++    if echo:
++        print("\nRunning: ", " ".join(cmd))
++
++    p = await asyncio.create_subprocess_exec(
++        cmd[0], *cmd[1:],
++        stdin=stdin,
++        stdout=asyncio.subprocess.PIPE,
++        stderr=asyncio.subprocess.PIPE,
++        env=env,
++    )
++
++    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
++    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
++    #
++    # If it starts hanging, will need to switch to the following code. The problem is that no data
++    # will be seen until it's done and if it hangs for example there will be no debug info.
++    # out, err = await p.communicate()
++    # return _RunOutput(p.returncode, out, err)
++
++    out = []
++    err = []
++
++    def tee(line, sink, pipe, label=""):
++        line = line.decode("utf-8").rstrip()
++        sink.append(line)
++        if not quiet:
++            print(label, line, file=pipe)
++
++    # asyncio.wait() rejects bare coroutines on Python >= 3.11, hence the tasks (XXX: the timeout seems ineffective)
++    await asyncio.wait(
++        [
++            asyncio.create_task(_read_stream(p.stdout, lambda line: tee(line, out, sys.stdout, label="stdout:"))),
++            asyncio.create_task(_read_stream(p.stderr, lambda line: tee(line, err, sys.stderr, label="stderr:"))),
++        ],
++        timeout=timeout,
++    )
++    return _RunOutput(await p.wait(), out, err)
++
++
++def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
++    loop = asyncio.get_event_loop()
++    result = loop.run_until_complete(
++        _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
++    )
++
++    cmd_str = " ".join(cmd)
++    if result.returncode > 0:
++        stderr = "\n".join(result.stderr)
++        raise RuntimeError(
++            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
++            f"The combined stderr from workers follows:\n{stderr}"
++        )
++
++    # check that the subprocess actually did run and produced some output, should the test rely on
++    # the remote side to do the testing
++    if not result.stdout and not result.stderr:
++        raise RuntimeError(f"'{cmd_str}' produced no output.")
++
++    return result
++
++
++def pytest_xdist_worker_id():
++    """
++    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0
++    if `-n 1` or `pytest-xdist` isn't being used.
++    """
++    worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
++    worker = re.sub(r"^gw", "", worker, count=0, flags=re.M)
++    return int(worker)
++
++
++def get_torch_dist_unique_port():
++    """
++    Returns a port number that can be fed to `torchrun`'s `--master_port` argument.
++
++    Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same
++    port at once.
++    """
++    port = 29500
++    uniq_delta = pytest_xdist_worker_id()
++    return port + uniq_delta