import logging
from datetime import datetime
from typing import Optional

import pandas as pd
# TODO (Find & Fix)


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and standardize a DataFrame without modifying the input.

    Steps, in order: drop exact duplicate rows, fill numeric nulls with the
    column mean, parse date-like columns to datetime (filling unparseable or
    missing values with the column median), and normalize the remaining
    object-typed text columns (nulls -> "Unknown", then strip + lowercase).

    Args:
        df: Input DataFrame. Must not be empty.

    Returns:
        A new, transformed DataFrame; ``df`` itself is left untouched.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("DataFrame is Empty.")

    # Work on a copy so the caller's frame is never mutated.
    df_transformed = df.copy()
    logger.info("🔄 Starting transformation of %d rows", len(df_transformed))

    # Handle duplicates.
    initial_rows = len(df_transformed)
    df_transformed.drop_duplicates(inplace=True)
    duplicates_removed = initial_rows - len(df_transformed)
    if duplicates_removed > 0:
        logger.info("Removed %d duplicate rows.", duplicates_removed)

    # Date-like columns are detected purely by name. Identify them up front
    # so the text clean-up below does not turn parsed dates back into strings.
    # NOTE(review): a numeric column whose name contains one of these keywords
    # (e.g. "time_ms") is also passed to pd.to_datetime -- confirm intended.
    date_keywords = ("date", "time", "created", "updated")
    date_columns = [
        col
        for col in df_transformed.columns
        if any(keyword in str(col).lower() for keyword in date_keywords)
    ]

    # Handle null values in numeric columns. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection: the chained in-place
    # form does not update the frame under pandas Copy-on-Write.
    numeric_columns = df_transformed.select_dtypes(include=["number"]).columns
    for col in numeric_columns:
        if df_transformed[col].isnull().any():
            mean_value = df_transformed[col].mean()
            df_transformed[col] = df_transformed[col].fillna(mean_value)

    # Standardize date columns; unparseable entries become NaT and are then
    # replaced by the column median (a no-op when every value is NaT).
    for col in date_columns:
        parsed = pd.to_datetime(df_transformed[col], errors="coerce")
        if parsed.isnull().any():
            parsed = parsed.fillna(parsed.median())
        df_transformed[col] = parsed

    # Handle nulls and clean text columns (strip, lowercase). Date columns
    # were converted above, so select_dtypes no longer reports them as object;
    # the explicit exclusion guards the case where conversion left them as-is.
    text_columns = [
        col
        for col in df_transformed.select_dtypes(include=["object"]).columns
        if col not in date_columns
    ]
    for col in text_columns:
        filled = df_transformed[col].fillna("Unknown")
        df_transformed[col] = filled.astype(str).str.strip().str.lower()

    logger.info("Transformation completed successfully.")
    return df_transformed
# 0 commit comments