From f0d6ef1ad9c58282d22d52dca85443c30bc40f57 Mon Sep 17 00:00:00 2001
From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Tue, 24 Jun 2025 19:15:43 +0530
Subject: [PATCH] feat(libcommon): add duckdb-based leak detection prototype
 (#2994)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds a prototype module for detecting data leaks and duplicates
between dataset splits, as suggested in issue #2994 and inspired by the
[LLE blog post](https://huggingface.co/blog/lbourdois/lle).

### 🔍 What it does:

- Introduces `detect_leaks(dataset_name, subset=None)` in
  `libs/libcommon/src/libcommon/leak_detection.py`
- Uses DuckDB to efficiently compute:
  - Leaks from train → test
  - Leaks from validation → test
  - Duplicates within the train, validation, and test splits
- Supports datasets with a `tokens` or `text` column, and optionally `ner_tags`

### 🛠️ Example usage:

```bash
python libs/libcommon/src/libcommon/leak_detection.py
```

This prints stats for conll2003 such as:

```yaml
train_test_leaks: 78
validation_test_leaks: 25
train_dup: 1350
validation_dup: 180
test_dup: 269
```

### 📌 Notes:

This is a standalone prototype and is not yet wired into jobs or the UI.
Future steps may include integrating it into dataset QA dashboards or
automated Hub checks.

Closes #2994
---
 libs/libcommon/src/libcommon/leak_detection.py | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 libs/libcommon/src/libcommon/leak_detection.py

diff --git a/libs/libcommon/src/libcommon/leak_detection.py b/libs/libcommon/src/libcommon/leak_detection.py
new file mode 100644
index 0000000000..3db2393254
--- /dev/null
+++ b/libs/libcommon/src/libcommon/leak_detection.py
@@ -0,0 +1,49 @@
+import duckdb
+from datasets import load_dataset
+
+def to_duckdb_table(dataset_split):
+    df = dataset_split.to_pandas()
+    # Prefer `tokens` (token-classification datasets), fall back to `text`.
+    source = 'tokens' if 'tokens' in df.columns else 'text'
+    df['text'] = df[source].astype(str)
+    if 'ner_tags' in df:
+        df['text_label'] = df['text'] + ' ' + df['ner_tags'].astype(str)
+    else:
+        df['text_label'] = df['text']
+    return df[['text', 'text_label']]
+
+def detect_leaks(dataset_name, subset=None):
+    print(f"Loading: {dataset_name} {f'({subset})' if subset else ''}")
+    ds = load_dataset(dataset_name, subset) if subset else load_dataset(dataset_name)
+    con = duckdb.connect()
+
+    # Register each available split as a DuckDB view over its DataFrame.
+    for split in ['train', 'validation', 'test']:
+        if split in ds:
+            df = to_duckdb_table(ds[split])
+            con.register(split, df)
+
+    results = {}
+
+    # Exact-match leaks across splits and duplicates within each split.
+    queries = {
+        "train_test_leaks": "SELECT COUNT(*) FROM train INNER JOIN test ON train.text = test.text",
+        "validation_test_leaks": "SELECT COUNT(*) FROM validation INNER JOIN test ON validation.text = test.text",
+        "train_dup": "SELECT COUNT(*) FROM (SELECT text FROM train GROUP BY text HAVING COUNT(*) > 1)",
+        "validation_dup": "SELECT COUNT(*) FROM (SELECT text FROM validation GROUP BY text HAVING COUNT(*) > 1)",
+        "test_dup": "SELECT COUNT(*) FROM (SELECT text FROM test GROUP BY text HAVING COUNT(*) > 1)"
+    }
+
+    for name, sql in queries.items():
+        try:
+            results[name] = con.execute(sql).fetchone()[0]
+        except Exception as e:
+            # A split may be missing (never registered); report instead of crashing.
+            results[name] = f"error: {e}"
+
+    return results
+
+if __name__ == "__main__":
+    stats = detect_leaks("conll2003")
+    for k, v in stats.items():
+        print(f"{k}: {v}")
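
A minimal sketch of calling the prototype from Python rather than via the CLI entry point, assuming `libcommon` is importable; the `wikiann` dataset and `"en"` config are illustrative placeholders exercising the `subset` argument, not something this patch depends on:

```python
from libcommon.leak_detection import detect_leaks

# Hypothetical usage: any Hub dataset with train/validation/test splits and a
# `tokens` or `text` column should work; "wikiann"/"en" are placeholders.
stats = detect_leaks("wikiann", subset="en")
for name, count in stats.items():
    print(f"{name}: {count}")
```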