Skip to content

Commit f0eb6b6

Browse files
authored
Merge branch 'main' into adf_publish
2 parents 2abebab + c9a53a2 commit f0eb6b6

File tree

10 files changed

+341
-16
lines changed

10 files changed

+341
-16
lines changed

code/datafactory/dataflow/IptvCuratedToKusto.json

Lines changed: 44 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,14 @@
1414
},
1515
"name": "AzureDataLakeSource",
1616
"description": "Source dataset in Azure Data Lake"
17+
},
18+
{
19+
"linkedService": {
20+
"referenceName": "AzureDataExplorer",
21+
"type": "LinkedServiceReference"
22+
},
23+
"name": "KustoSinkReference",
24+
"description": "Kusto Sink Reference Dataset"
1725
}
1826
],
1927
"sinks": [
@@ -30,7 +38,11 @@
3038
}
3139
}
3240
],
33-
"transformations": [],
41+
"transformations": [
42+
{
43+
"name": "ExistingHashes"
44+
}
45+
],
3446
"scriptLines": [
3547
"parameters{",
3648
" sourceFileSystem as string,",
@@ -55,19 +67,48 @@
5567
" Topology5 as string,",
5668
" TvModel as string,",
5769
" UserId as long,",
58-
" EndTime as timestamp",
70+
" EndTime as timestamp,",
71+
" Hash as string",
5972
" ),",
6073
" allowSchemaDrift: true,",
6174
" validateSchema: false,",
6275
" ignoreNoFilesFound: false,",
6376
" format: 'delta',",
6477
" fileSystem: ($sourceFileSystem),",
6578
" folderPath: ($sourceFolderPath)) ~> AzureDataLakeSource",
66-
"AzureDataLakeSource sink(allowSchemaDrift: true,",
79+
"source(output(",
80+
" AppVersion as string,",
81+
" Country as string,",
82+
" HappinessScore as double,",
83+
" HwModel as string,",
84+
" Service as string,",
85+
" ServiceType as string,",
86+
" StbModel as string,",
87+
" StreamingProtocol as string,",
88+
" Title as string,",
89+
" Topology2 as string,",
90+
" Topology3 as string,",
91+
" Topology4 as string,",
92+
" Topology5 as string,",
93+
" TvModel as string,",
94+
" UserId as string,",
95+
" EndTime as timestamp,",
96+
" Hash as string",
97+
" ),",
98+
" allowSchemaDrift: true,",
99+
" validateSchema: false,",
100+
" format: 'table',",
101+
" tableName: ($sinkTable),",
102+
" store: 'azuredataexplorer') ~> KustoSinkReference",
103+
"AzureDataLakeSource, KustoSinkReference exists(AzureDataLakeSource@Hash == KustoSinkReference@Hash,",
104+
" negate:true,",
105+
" broadcast: 'auto')~> ExistingHashes",
106+
"ExistingHashes sink(allowSchemaDrift: true,",
67107
" validateSchema: false,",
68108
" format: 'table',",
69109
" tableName: ($sinkTable),",
70110
" store: 'azuredataexplorer',",
111+
" postSQLs:[(concat('.delete table ', $sinkTable, ' records with (whatif=false) <| ', $sinkTable, ' | sort by Hash, ingestion_time() desc | where row_cumsum(1,prev(Hash) != Hash) > 1'))],",
71112
" skipDuplicateMapInputs: true,",
72113
" skipDuplicateMapOutputs: true,",
73114
" outputAssertFailedRows: true,",

code/datafactory/dataflow/IptvRawToCurated.json

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,9 @@
3232
}
3333
],
3434
"transformations": [
35+
{
36+
"name": "DistinctRows"
37+
},
3538
{
3639
"name": "RenameColumns",
3740
"description": "Rename Columns"
@@ -84,7 +87,9 @@
8487
" escapeChar: '\\\\',",
8588
" quoteChar: '\\\"',",
8689
" columnNamesAsHeader: true) ~> AzureDataLakeSource",
87-
"AzureDataLakeSource select(mapColumn(",
90+
"AzureDataLakeSource aggregate(groupBy(hash = sha2(256,columns())),",
91+
" each(match(true()), $$ = first($$))) ~> DistinctRows",
92+
"DistinctRows select(mapColumn(",
8893
" AppVersion = app_version,",
8994
" Country = country,",
9095
" HappinessScore = happiness_score,",
@@ -100,7 +105,8 @@
100105
" Topology5 = topology_5,",
101106
" TvModel = tv_model,",
102107
" UserId = user_id,",
103-
" EndTime = end_time",
108+
" EndTime = end_time,",
109+
" Hash = hash",
104110
" ),",
105111
" skipDuplicateMapInputs: true,",
106112
" skipDuplicateMapOutputs: true) ~> RenameColumns",
@@ -119,7 +125,7 @@
119125
" insertable: false,",
120126
" updateable: false,",
121127
" upsertable: true,",
122-
" keys:['AppVersion','Country','HwModel','Service','ServiceType','StbModel','StreamingProtocol','Title','Topology2','Topology3','Topology4','Topology5','TvModel','UserId','EndTime'],",
128+
" keys:['Hash'],",
123129
" umask: 0022,",
124130
" preCommands: [],",
125131
" postCommands: [],",
@@ -129,7 +135,7 @@
129135
" assertFailure_fileSystem: ($logsFileSystem),",
130136
" assertFailure_folderPath: ($logsFolderPath),",
131137
" partitionBy('key',",
132-
" 0,",
138+
" 1,",
133139
" Country",
134140
" )) ~> AzureDataLakeSink"
135141
]

code/datafactory/dataflow/OttCuratedToKusto.json

Lines changed: 42 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,14 @@
1414
},
1515
"name": "AzureDataLakeSource",
1616
"description": "Source dataset in Azure Data Lake"
17+
},
18+
{
19+
"linkedService": {
20+
"referenceName": "AzureDataExplorer",
21+
"type": "LinkedServiceReference"
22+
},
23+
"name": "KustoSinkReference",
24+
"description": "Kusto Sink Reference Dataset"
1725
}
1826
],
1927
"sinks": [
@@ -30,7 +38,11 @@
3038
}
3139
}
3240
],
33-
"transformations": [],
41+
"transformations": [
42+
{
43+
"name": "ExistingHashes"
44+
}
45+
],
3446
"scriptLines": [
3547
"parameters{",
3648
" sourceFileSystem as string,",
@@ -53,19 +65,46 @@
5365
" DeviceVendor as string,",
5466
" HappinessScore as float,",
5567
" UserId as long,",
56-
" EndTime as timestamp",
68+
" EndTime as timestamp,",
69+
" Hash as string",
5770
" ),",
5871
" allowSchemaDrift: true,",
5972
" validateSchema: false,",
6073
" ignoreNoFilesFound: false,",
6174
" format: 'delta',",
6275
" fileSystem: ($sourceFileSystem),",
6376
" folderPath: ($sourceFolderPath)) ~> AzureDataLakeSource",
64-
"AzureDataLakeSource sink(allowSchemaDrift: true,",
77+
"source(output(",
78+
" Country as string,",
79+
" Isp as string,",
80+
" CdnNodeHost as string,",
81+
" Type as string,",
82+
" Title as string,",
83+
" SelectedQuality as string,",
84+
" DeviceType as string,",
85+
" Version as string,",
86+
" Connection as string,",
87+
" CommercilizationType as string,",
88+
" DeviceVendor as string,",
89+
" HappinessScore as double,",
90+
" UserId as string,",
91+
" EndTime as timestamp,",
92+
" Hash as string",
93+
" ),",
94+
" allowSchemaDrift: true,",
95+
" validateSchema: false,",
96+
" format: 'table',",
97+
" tableName: ($sinkTable),",
98+
" store: 'azuredataexplorer') ~> KustoSinkReference",
99+
"AzureDataLakeSource, KustoSinkReference exists(AzureDataLakeSource@Hash == KustoSinkReference@Hash,",
100+
" negate:true,",
101+
" broadcast: 'auto')~> ExistingHashes",
102+
"ExistingHashes sink(allowSchemaDrift: true,",
65103
" validateSchema: false,",
66104
" format: 'table',",
67105
" tableName: ($sinkTable),",
68106
" store: 'azuredataexplorer',",
107+
" postSQLs:[(concat('.delete table ', $sinkTable, ' records with (whatif=false) <| ', $sinkTable, ' | sort by Hash, ingestion_time() desc | where row_cumsum(1,prev(Hash) != Hash) > 1'))],",
69108
" skipDuplicateMapInputs: true,",
70109
" skipDuplicateMapOutputs: true,",
71110
" outputAssertFailedRows: true,",

code/datafactory/dataflow/OttRawToCurated.json

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,9 @@
3232
}
3333
],
3434
"transformations": [
35+
{
36+
"name": "DistinctRows"
37+
},
3538
{
3639
"name": "RenameColumns",
3740
"description": "Rename Columns"
@@ -86,7 +89,9 @@
8689
" escapeChar: '\\\\',",
8790
" quoteChar: '\\\"',",
8891
" columnNamesAsHeader: true) ~> AzureDataLakeSource",
89-
"AzureDataLakeSource select(mapColumn(",
92+
"AzureDataLakeSource aggregate(groupBy(hash = sha2(256,columns())),",
93+
" each(match(true()), $$ = first($$))) ~> DistinctRows",
94+
"DistinctRows select(mapColumn(",
9095
" Country,",
9196
" Isp = ISP,",
9297
" CdnNodeHost = {CDN Node Host},",
@@ -100,7 +105,8 @@
100105
" DeviceVendor = {Device Vendor},",
101106
" HappinessScore = {Happiness Score},",
102107
" UserId = {User ID},",
103-
" EndTime = {End Time}",
108+
" EndTime = {End Time},",
109+
" Hash = hash",
104110
" ),",
105111
" skipDuplicateMapInputs: true,",
106112
" skipDuplicateMapOutputs: true) ~> RenameColumns",
@@ -121,7 +127,7 @@
121127
" insertable: false,",
122128
" updateable: false,",
123129
" upsertable: true,",
124-
" keys:['Country','Isp','CdnNodeHost','Type','Title','SelectedQuality','DeviceType','Version','Connection','CommercilizationType','DeviceVendor','UserId','EndTime'],",
130+
" keys:['Hash'],",
125131
" umask: 0022,",
126132
" preCommands: [],",
127133
" postCommands: [],",

code/datafactory/factory/sc4-dev-df001.json

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,18 @@
1313
"minimumNumberOfUsersPerWindow": {
1414
"type": "float",
1515
"value": 10
16+
},
17+
"minimumPlays": {
18+
"type": "float",
19+
"value": 20
20+
},
21+
"minimumLift": {
22+
"type": "float",
23+
"value": 5
24+
},
25+
"minimumUsersRatio": {
26+
"type": "float",
27+
"value": 0.1
1628
}
1729
}
1830
},

0 commit comments

Comments
 (0)