// Partition transform helpers (years, bucket, ...) from the DataFrame API;
// only needed if the partition spec is declared via writeTo(...).partitionedBy(...)
// rather than SQL DDL as below.
import org.apache.spark.sql.functions.{years, months, days, hours, bucket}

val matches = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/home/iceberg/data/matches.csv")

spark.sql(f"""DROP TABLE IF EXISTS bootcamp.matches_bucketed PURGE""")

spark.sql(f"""
CREATE OR REPLACE TABLE bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (years(completion_date), bucket(16, match_id))
""")

matches.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
  .writeTo("bootcamp.matches_bucketed")
  .overwritePartitions()

// FOR VALIDATION: expect 16 buckets * 2 years = 32 data files (one file per bucket per year)
spark.sql(f"""
SELECT COUNT(*) FROM bootcamp.matches_bucketed.files
""").show()
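
// Optional sketch (not part of the original flow): the same year + bucket partition
// spec can be declared through the DataFrameWriterV2 API instead of SQL DDL, which is
// where the transform imports above come into play. The table name
// bootcamp.matches_bucketed_df is a hypothetical example used only for illustration;
// it assumes the same Spark session and implicits as the notebook above.
matches.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
  .writeTo("bootcamp.matches_bucketed_df")
  .using("iceberg")
  .partitionedBy(years($"completion_date"), bucket(16, $"match_id"))
  .createOrReplace()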