From f0d6ef1ad9c58282d22d52dca85443c30bc40f57 Mon Sep 17 00:00:00 2001
From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Tue, 24 Jun 2025 19:15:43 +0530
Subject: [PATCH] feat(libcommon): add duckdb-based leak detection prototype
 (#2994)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds a prototype module for detecting data leaks and duplicates
between dataset splits, as suggested in issue #2994 and inspired by the
[LLE blog post](https://huggingface.co/blog/lbourdois/lle).

### 🔍 What it does:

- Introduces `detect_leaks(dataset_name, subset=None)` in
  `libs/libcommon/src/libcommon/leak_detection.py`
- Uses DuckDB to efficiently compute:
  - Leaks from train → test
  - Leaks from validation → test
  - Duplicates within the train, validation, and test splits
- Supports datasets with a `tokens` or `text` column, and optionally `ner_tags`

### 🛠️ Example usage:

```bash
python libs/libcommon/src/libcommon/leak_detection.py
```

This prints stats for conll2003 such as:

```yaml
train_test_leaks: 78
validation_test_leaks: 25
train_dup: 1350
validation_dup: 180
test_dup: 269
```

### 📌 Notes:

This is a standalone prototype and is not yet wired into jobs or the UI.
Future steps may include integrating it into dataset QA dashboards or
automated Hub checks.

Closes #2994
---
 libs/libcommon/src/libcommon/leak_detection.py | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 libs/libcommon/src/libcommon/leak_detection.py

diff --git a/libs/libcommon/src/libcommon/leak_detection.py b/libs/libcommon/src/libcommon/leak_detection.py
new file mode 100644
index 0000000000..3db2393254
--- /dev/null
+++ b/libs/libcommon/src/libcommon/leak_detection.py
@@ -0,0 +1,49 @@
+import duckdb
+from datasets import load_dataset
+
+def to_duckdb_table(dataset_split):
+    df = dataset_split.to_pandas()
+    # Prefer `tokens` (token-classification datasets), fall back to `text`.
+    source = 'tokens' if 'tokens' in df.columns else 'text'
+    df['text'] = df[source].astype(str)
+    if 'ner_tags' in df:
+        df['text_label'] = df['text'] + ' ' + df['ner_tags'].astype(str)
+    else:
+        df['text_label'] = df['text']
+    return df[['text', 'text_label']]
+
+def detect_leaks(dataset_name, subset=None):
+    print(f"Loading: {dataset_name} {f'({subset})' if subset else ''}")
+    ds = load_dataset(dataset_name, subset) if subset else load_dataset(dataset_name)
+    con = duckdb.connect()
+
+    # Register each available split as a DuckDB view over its DataFrame.
+    for split in ['train', 'validation', 'test']:
+        if split in ds:
+            df = to_duckdb_table(ds[split])
+            con.register(split, df)
+
+    results = {}
+
+    # Exact-match leaks across splits and duplicates within each split.
+    queries = {
+        "train_test_leaks": "SELECT COUNT(*) FROM train INNER JOIN test ON train.text = test.text",
+        "validation_test_leaks": "SELECT COUNT(*) FROM validation INNER JOIN test ON validation.text = test.text",
+        "train_dup": "SELECT COUNT(*) FROM (SELECT text FROM train GROUP BY text HAVING COUNT(*) > 1)",
+        "validation_dup": "SELECT COUNT(*) FROM (SELECT text FROM validation GROUP BY text HAVING COUNT(*) > 1)",
+        "test_dup": "SELECT COUNT(*) FROM (SELECT text FROM test GROUP BY text HAVING COUNT(*) > 1)"
+    }
+
+    for name, sql in queries.items():
+        try:
+            results[name] = con.execute(sql).fetchone()[0]
+        except Exception as e:
+            # A split may be missing (never registered); report instead of crashing.
+            results[name] = f"error: {e}"
+
+    return results
+
+if __name__ == "__main__":
+    stats = detect_leaks("conll2003")
+    for k, v in stats.items():
+        print(f"{k}: {v}")
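
A minimal sketch of calling the prototype from Python rather than via the CLI entry point, assuming `libcommon` is importable; the `wikiann` dataset and `"en"` config are illustrative placeholders exercising the `subset` argument, not something this patch depends on:

```python
from libcommon.leak_detection import detect_leaks

# Hypothetical usage: any Hub dataset with train/validation/test splits and a
# `tokens` or `text` column should work; "wikiann"/"en" are placeholders.
stats = detect_leaks("wikiann", subset="en")
for name, count in stats.items():
    print(f"{name}: {count}")
```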