visual-layer
diff --git a/‎fastdup/__init__.py
Lines changed: 164 additions & 229 deletions b/‎fastdup/__init__.py
Lines changed: 164 additions & 229 deletions
diff --git a/‎fastdup/datasets.py
Lines changed: 37 additions & 12 deletions b/‎fastdup/datasets.py
Lines changed: 37 additions & 12 deletions
diff --git a/‎fastdup/definitions.py
Lines changed: 4 additions & 7 deletions b/‎fastdup/definitions.py
Lines changed: 4 additions & 7 deletions
@@ -1,18 +1,18 @@
-import os
-import pandas as pd
-from datasets import load_dataset, Dataset
-from datasets.config import HF_DATASETS_CACHE
-from tqdm.auto import tqdm
 import hashlib
 import logging
-from typing import Optional, Any
+import os
 from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Optional
+
+import pandas as pd
+from fastdup.datasets import Dataset, load_dataset
+from datasets.config import HF_DATASETS_CACHE
 from fastdup.sentry import fastdup_capture_exception
+from PIL import Image
+from tqdm.auto import tqdm
 
-# Configure logging
 logging.basicConfig(level=logging.INFO)
 
-
 class FastdupHFDataset(Dataset):
     """
     FastdupHFDataset is a subclass of Hugging Face's Dataset, tailored for usage in fastdup.
@@ -55,7 +55,7 @@ def __init__(
     ) -> None:
         self.img_key: str = img_key
         self.label_key: str = label_key
-        self.jpg_save_dir = jpg_save_dir
+        self.jpg_save_dir: str = jpg_save_dir
 
         if cache_dir:
             self.cache_dir: str = cache_dir
@@ -74,6 +74,22 @@ def __init__(
             self.hf_dataset.data, self.hf_dataset.info, self.hf_dataset.split
         )
 
+        # Check that img_key and label_key both match column names in the dataset
+        valid_columns = list(self.hf_dataset.features.keys())
+
+        if self.img_key not in valid_columns:
+            raise ValueError(
+                f"The specified img_key '{self.img_key}' is not present in the dataset's columns. "
+                f"Please ensure that the img_key matches one of the existing dataset columns. "
+                f"Available columns are: {', '.join(valid_columns)}."
+            )
+        if self.label_key not in valid_columns:
+            raise ValueError(
+               f"The specified label_key '{self.label_key}' is not present in the dataset's columns. "
+               f"Please ensure that the label_key matches one of the existing dataset columns. "
+               f"Available columns are: {', '.join(valid_columns)}."
+            )
+
         # If jpg folder does not exist, run conversion and cache the folder
         jpg_img_folder = os.path.join(self.cache_dir, self.hf_dataset.info.dataset_name, self.jpg_save_dir)
         if not os.path.exists(jpg_img_folder):
@@ -149,14 +165,24 @@ def _save_single_image(self, idx: int, item: dict, pbar) -> None:
                 str(label),
             )
             os.makedirs(label_dir, exist_ok=True)
-            image.save(os.path.join(label_dir, f"{idx}.jpg"))
+
+            if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info):
+                # Convert to RGB by pasting on a white background
+                background = Image.new('RGB', image.size, (255, 255, 255))
+                if image.mode == 'P':
+                    image = image.convert('RGBA')
+                background.paste(image, mask=image.split()[3])  # band 3 is alpha for RGBA; NOTE(review): 'LA' images have only two bands, so index 3 looks out of range for them - confirm
+                image = background
+
+            image.convert('RGB').save(os.path.join(label_dir, f"{idx}.jpg"), 'JPEG')
             pbar.update(1)
+
         except Exception as e:
             fastdup_capture_exception("Error saving an image", e)
             logging.error(f"Error in saving image at index {idx}: {e}")
 
     def _save_as_image_files(self) -> None:
-        with tqdm(total=len(self.hf_dataset), desc="Converting to .jpg images") as pbar:
+        with tqdm(total=len(self.hf_dataset), desc="Converting images for analysis:") as pbar:
             with ThreadPoolExecutor() as executor:
                 executor.map(
                     self._save_single_image,
@@ -187,4 +213,3 @@ def annotations(self) -> pd.DataFrame:
         df: pd.DataFrame = pd.DataFrame({"filename": filenames, "label": labels})
         return df
 
-
 
@@ -50,17 +50,14 @@
 
 DEFUALT_METRIC_ZERO = 0
 DEFAULT_METRIC_MINUS_ONE = -1
-VERSION__ = "1.86"
-
-GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
-
-MATPLOTLIB_ERROR_MSG = "Warning: failed to import matplotlib, plot is not generated. Please pip install matplotlib if you "
+# Version is dynamically inserted during build process from FASTDUP_VERSION file (line below will be replaced)
+VERSION__ = "2.30"
 "like to view aggregate stats plots. Matplotlib is deliberately not included as a requirement since it has multiple backends "
 "and special care needs to select the right backend for your OS/Hardware combination. You can install matplot lib using "
 "python3.8 -m pip install matplotlib matplotlib-inline. (change the python3.8 to your python version). "
 
-SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp", ".jp2"]
-SUPPORTED_VID_FORMATS = [".mp4", ".avi", ".dav", ".m4a", ".m4v", ".mov"]
+SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp", ".jp2", ".jfif", ".pdf", ".dcm", ".dicom", ".qaf"]
+SUPPORTED_VID_FORMATS = [".mp4", ".avi", ".dav", ".m4v", ".mov", ".mkv", ".wmv", ".flv", ".webm", ".mpg", ".mpeg", ".3gp"]
 
 RUN_ALL = 0
 RUN_EXTRACT = 1