Skip to content

Commit 0847e2b

Browse files
dbickson and claude authored
Update fastdup Python files from visual_database/cxx (#372)
* Update fastdup Python files from visual_database/cxx

  This commit updates 11 Python files in the fastdup package by copying the latest versions from ~/visual_database/cxx/fastdup/. The changes include updates to core modules like __init__.py, fastdup_controller.py, and html_writer.py with improved functionality and bug fixes.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

* Update copyright notices to 2025

  Updated copyright year from 2022/2024 to 2025 in 5 fastdup Python files:
  - __init__.py
  - coco.py
  - galleries.py
  - html_writer.py
  - tensorboard_projector.py

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 0a83073 commit 0847e2b

File tree

9 files changed

+515
-470
lines changed

9 files changed

+515
-470
lines changed

fastdup/__init__.py

Lines changed: 164 additions & 229 deletions
Large diffs are not rendered by default.

fastdup/datasets.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
import os
2-
import pandas as pd
3-
from datasets import load_dataset, Dataset
4-
from datasets.config import HF_DATASETS_CACHE
5-
from tqdm.auto import tqdm
61
import hashlib
72
import logging
8-
from typing import Optional, Any
3+
import os
94
from concurrent.futures import ThreadPoolExecutor
5+
from typing import Any, Optional
6+
7+
import pandas as pd
8+
from fastdup.datasets import Dataset, load_dataset
9+
from datasets.config import HF_DATASETS_CACHE
1010
from fastdup.sentry import fastdup_capture_exception
11+
from PIL import Image
12+
from tqdm.auto import tqdm
1113

12-
# Configure logging
1314
logging.basicConfig(level=logging.INFO)
1415

15-
1616
class FastdupHFDataset(Dataset):
1717
"""
1818
FastdupHFDataset is a subclass of Hugging Face's Dataset, tailored for usage in fastdup.
@@ -55,7 +55,7 @@ def __init__(
5555
) -> None:
5656
self.img_key: str = img_key
5757
self.label_key: str = label_key
58-
self.jpg_save_dir = jpg_save_dir
58+
self.jpg_save_dir: str = jpg_save_dir
5959

6060
if cache_dir:
6161
self.cache_dir: str = cache_dir
@@ -74,6 +74,22 @@ def __init__(
7474
self.hf_dataset.data, self.hf_dataset.info, self.hf_dataset.split
7575
)
7676

77+
# Check that img_key and label_key each match one of the dataset's columns
78+
valid_columns = list(self.hf_dataset.features.keys())
79+
80+
if self.img_key not in valid_columns:
81+
raise ValueError(
82+
f"The specified img_key '{self.img_key}' is not present in the dataset's columns. "
83+
f"Please ensure that the img_key matches one of the existing dataset columns. "
84+
f"Available columns are: {', '.join(valid_columns)}."
85+
)
86+
if self.label_key not in valid_columns:
87+
raise ValueError(
88+
f"The specified label_key '{self.label_key}' is not present in the dataset's columns. "
89+
f"Please ensure that the label_key matches one of the existing dataset columns. "
90+
f"Available columns are: {', '.join(valid_columns)}."
91+
)
92+
7793
# If jpg folder does not exist, run conversion and cache the folder
7894
jpg_img_folder = os.path.join(self.cache_dir, self.hf_dataset.info.dataset_name, self.jpg_save_dir)
7995
if not os.path.exists(jpg_img_folder):
@@ -149,14 +165,24 @@ def _save_single_image(self, idx: int, item: dict, pbar) -> None:
149165
str(label),
150166
)
151167
os.makedirs(label_dir, exist_ok=True)
152-
image.save(os.path.join(label_dir, f"{idx}.jpg"))
168+
169+
if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
170+
# Convert to RGB by pasting on a white background
171+
background = Image.new('RGB', image.size, (255, 255, 255))
172+
if image.mode == 'P':
173+
image = image.convert('RGBA')
174+
background.paste(image, mask=image.split()[3]) # 3 is the alpha channel
175+
image = background
176+
177+
image.convert('RGB').save(os.path.join(label_dir, f"{idx}.jpg"), 'JPEG')
153178
pbar.update(1)
179+
154180
except Exception as e:
155181
fastdup_capture_exception("Error saving an image", e)
156182
logging.error(f"Error in saving image at index {idx}: {e}")
157183

158184
def _save_as_image_files(self) -> None:
159-
with tqdm(total=len(self.hf_dataset), desc="Converting to .jpg images") as pbar:
185+
with tqdm(total=len(self.hf_dataset), desc="Converting images for analysis:") as pbar:
160186
with ThreadPoolExecutor() as executor:
161187
executor.map(
162188
self._save_single_image,
@@ -187,4 +213,3 @@ def annotations(self) -> pd.DataFrame:
187213
df: pd.DataFrame = pd.DataFrame({"filename": filenames, "label": labels})
188214
return df
189215

190-

fastdup/definitions.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,14 @@
5050

5151
DEFUALT_METRIC_ZERO = 0
5252
DEFAULT_METRIC_MINUS_ONE = -1
53-
VERSION__ = "1.86"
54-
55-
GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
56-
57-
MATPLOTLIB_ERROR_MSG = "Warning: failed to import matplotlib, plot is not generated. Please pip install matplotlib if you "
53+
# Version is dynamically inserted during the build process from the FASTDUP_VERSION file (the line below will be replaced)
54+
VERSION__ = "2.30"
5855
"like to view aggregate stats plots. Matplotlib is deliberately not included as a requirement since it has multiple backends "
5956
"and special care needs to select the right backend for your OS/Hardware combination. You can install matplot lib using "
6057
"python3.8 -m pip install matplotlib matplotlib-inline. (change the python3.8 to your python version). "
6158

62-
SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp", ".jp2"]
63-
SUPPORTED_VID_FORMATS = [".mp4", ".avi", ".dav", ".m4a", ".m4v", ".mov"]
59+
SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp", ".jp2", ".jfif", ".pdf", ".dcm", ".dicom", ".qaf"]
60+
SUPPORTED_VID_FORMATS = [".mp4", ".avi", ".dav", ".m4v", ".mov", ".mkv", ".wmv", ".flv", ".webm", ".mpg", ".mpeg", ".3gp"]
6461

6562
RUN_ALL = 0
6663
RUN_EXTRACT = 1

0 commit comments

Comments
 (0)