3 changes: 2 additions & 1 deletion .github/linters/eslint.config.mjs
@@ -22,7 +22,8 @@ export default [
ctx: 'readonly',
constants: 'readonly',
reports: 'readonly',
reservations: 'readonly'
reservations: 'readonly',
descriptions: 'readonly'
}
},
rules: {
43 changes: 3 additions & 40 deletions definitions/output/crawl/pages.js
@@ -1,3 +1,5 @@
const columns = descriptions.columns.pages

// See https://github.com/HTTPArchive/dataform/issues/43
assert('corrupted_technology_values')
.tags(['crawl_complete'])
@@ -53,46 +55,7 @@ publish('pages', {
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
requirePartitionFilter: true
},
columns: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin',
root_page: 'The URL of the root page being tested, the origin followed by /',
rank: 'Site popularity rank, from CrUX',
wptid: 'ID of the WebPageTest results',
payload: 'JSON-encoded WebPageTest results for the page',
summary: 'JSON-encoded summarization of the page-level data',
custom_metrics: {
description: 'Custom metrics from WebPageTest',
columns: {
a11y: 'JSON-encoded A11Y metrics',
cms: 'JSON-encoded CMS detection',
cookies: 'JSON-encoded cookie metrics',
css_variables: 'JSON-encoded CSS variable metrics',
ecommerce: 'JSON-encoded ecommerce metrics',
element_count: 'JSON-encoded element count metrics',
javascript: 'JSON-encoded JavaScript metrics',
markup: 'JSON-encoded markup metrics',
media: 'JSON-encoded media metrics',
origin_trials: 'JSON-encoded origin trial metrics',
performance: 'JSON-encoded performance metrics',
privacy: 'JSON-encoded privacy metrics',
responsive_images: 'JSON-encoded responsive image metrics',
robots_txt: 'JSON-encoded robots.txt metrics',
security: 'JSON-encoded security metrics',
structured_data: 'JSON-encoded structured data metrics',
third_parties: 'JSON-encoded third-party metrics',
well_known: 'JSON-encoded well-known metrics',
wpt_bodies: 'JSON-encoded WebPageTest bodies',
other: 'JSON-encoded other custom metrics'
}
},
lighthouse: 'JSON-encoded Lighthouse report',
features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
metadata: 'Additional metadata about the test'
},
columns: columns,
tags: ['crawl_complete'],
dependOnDependencyAssertions: true
}).preOps(ctx => `
13 changes: 3 additions & 10 deletions definitions/output/crawl/parsed_css.js
@@ -1,3 +1,5 @@
const columns = descriptions.columns.parsed_css

publish('parsed_css', {
type: 'incremental',
protected: true,
@@ -7,16 +9,7 @@ publish('parsed_css', {
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
requirePartitionFilter: true
},
columns: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin.',
root_page: 'The URL of the root page being tested',
rank: 'Site popularity rank, from CrUX',
url: 'The URL of the request',
css: 'The parsed CSS, in JSON format'
},
columns: columns,
tags: ['crawl_complete']
}).preOps(ctx => `
${reservations.reservation_setter(ctx)}
32 changes: 3 additions & 29 deletions definitions/output/crawl/requests.js
@@ -1,3 +1,5 @@
const columns = descriptions.columns.requests

publish('requests', {
type: 'incremental',
protected: true,
@@ -7,35 +9,7 @@ publish('requests', {
clusterBy: ['client', 'is_root_page', 'type', 'rank'],
requirePartitionFilter: true
},
columns: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin.',
root_page: 'The URL of the root page being tested',
rank: 'Site popularity rank, from CrUX',
url: 'The URL of the request',
is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
index: 'The sequential 0-based index of the request',
payload: 'JSON-encoded WebPageTest result data for this request',
summary: 'JSON-encoded summarization of request data',
request_headers: {
description: 'Request headers',
columns: {
name: 'Request header name',
value: 'Request header value'
}
},
response_headers: {
description: 'Response headers',
columns: {
name: 'Response header name',
value: 'Response header value'
}
},
response_body: 'Text-based response body'
},
columns: columns,
tags: ['crawl_complete']
}).preOps(ctx => `
${reservations.reservation_setter(ctx)}
29 changes: 29 additions & 0 deletions definitions/output/latest/pages.js
@@ -0,0 +1,29 @@
const columns = descriptions.columns.pages

publish('pages', {
type: 'view',
schema: 'latest',
columns: columns
}).query(ctx => `
SELECT
*
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = (
SELECT
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
FROM
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
WHERE
table_name = 'pages' AND
/* Only include actual dates in partition ids */
partition_id >= '20250101' AND
partition_id < '20990101' AND
/* Exclude future dates - shouldn't be any, but you never know! */
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
) AND
/* The following should help make this even faster since above query is a little complex */
/* We should never be more than 60 days old hopefully! */
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
date <= CURRENT_DATE()
`)
29 changes: 29 additions & 0 deletions definitions/output/latest/parsed_css.js
@@ -0,0 +1,29 @@
const columns = descriptions.columns.parsed_css

publish('parsed_css', {
type: 'view',
schema: 'latest',
columns: columns,
}).query(ctx => `
SELECT
*
FROM ${ctx.ref('crawl', 'parsed_css')}
WHERE
date = (
SELECT
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
FROM
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
WHERE
table_name = 'parsed_css' AND
/* Only include actual dates in partition ids */
partition_id >= '20250101' AND
partition_id < '20990101' AND
/* Exclude future dates - shouldn't be any, but you never know! */
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
) AND
/* The following should help make this even faster since above query is a little complex */
/* We should never be more than 60 days old hopefully! */
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
date <= CURRENT_DATE()
`)
30 changes: 30 additions & 0 deletions definitions/output/latest/requests.js
@@ -0,0 +1,30 @@
const columns = descriptions.columns.requests

publish('requests', {
type: 'view',
schema: 'latest',
columns: columns,
}).query(ctx => `
SELECT
*
FROM
${ctx.ref('crawl', 'requests')}
WHERE
date = (
SELECT
PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
FROM
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
WHERE
table_name = 'requests' AND
/* Only include actual dates in partition ids */
partition_id >= '20250101' AND
partition_id < '20990101' AND
/* Exclude future dates - shouldn't be any, but you never know! */
partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
) AND
/* The following should help make this even faster since above query is a little complex */
/* We should never be more than 60 days old hopefully! */
date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
date <= CURRENT_DATE()
`)
3 changes: 3 additions & 0 deletions definitions/output/sample_data/pages_10k.js
@@ -1,10 +1,13 @@
const columns = descriptions.columns.pages

publish('pages_10k', {
type: 'table',
schema: 'sample_data',
bigquery: {
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank', 'page']
},
columns: columns,
tags: ['crawl_complete']
}).query(ctx => `
SELECT *
3 changes: 3 additions & 0 deletions definitions/output/sample_data/parsed_css_10k.js
@@ -1,10 +1,13 @@
const columns = descriptions.columns.parsed_css

publish('parsed_css_10k', {
type: 'table',
schema: 'sample_data',
bigquery: {
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank', 'page']
},
columns: columns,
tags: ['crawl_complete']
}).query(ctx => `
SELECT *
3 changes: 3 additions & 0 deletions definitions/output/sample_data/requests_10k.js
@@ -1,10 +1,13 @@
const columns = descriptions.columns.requests

publish('requests_10k', {
type: 'table',
schema: 'sample_data',
bigquery: {
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank', 'type']
},
columns: columns,
tags: ['crawl_complete']
}).query(ctx => `
SELECT *
86 changes: 86 additions & 0 deletions includes/descriptions.js
@@ -0,0 +1,86 @@

const columns = {
pages: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin',
root_page: 'The URL of the root page being tested, the origin followed by /',
rank: 'Site popularity rank, from CrUX',
wptid: 'ID of the WebPageTest results',
payload: 'JSON-encoded WebPageTest results for the page',
summary: 'JSON-encoded summarization of the page-level data',
custom_metrics: {
description: 'Custom metrics from WebPageTest',
columns: {
a11y: 'JSON-encoded A11Y metrics',
cms: 'JSON-encoded CMS detection',
cookies: 'JSON-encoded cookie metrics',
css_variables: 'JSON-encoded CSS variable metrics',
ecommerce: 'JSON-encoded ecommerce metrics',
element_count: 'JSON-encoded element count metrics',
javascript: 'JSON-encoded JavaScript metrics',
markup: 'JSON-encoded markup metrics',
media: 'JSON-encoded media metrics',
origin_trials: 'JSON-encoded origin trial metrics',
performance: 'JSON-encoded performance metrics',
privacy: 'JSON-encoded privacy metrics',
responsive_images: 'JSON-encoded responsive image metrics',
robots_txt: 'JSON-encoded robots.txt metrics',
security: 'JSON-encoded security metrics',
structured_data: 'JSON-encoded structured data metrics',
third_parties: 'JSON-encoded third-party metrics',
well_known: 'JSON-encoded well-known metrics',
wpt_bodies: 'JSON-encoded WebPageTest bodies',
other: 'JSON-encoded other custom metrics'
}
},
lighthouse: 'JSON-encoded Lighthouse report',
features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
metadata: 'Additional metadata about the test'
},
requests: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin.',
root_page: 'The URL of the root page being tested',
rank: 'Site popularity rank, from CrUX',
url: 'The URL of the request',
is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
index: 'The sequential 0-based index of the request',
payload: 'JSON-encoded WebPageTest result data for this request',
summary: 'JSON-encoded summarization of request data',
request_headers: {
description: 'Request headers',
columns: {
name: 'Request header name',
value: 'Request header value'
}
},
response_headers: {
description: 'Response headers',
columns: {
name: 'Response header name',
value: 'Response header value'
}
},
response_body: 'Text-based response body'
},
parsed_css: {
date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
client: 'Test environment: desktop or mobile',
page: 'The URL of the page being tested',
is_root_page: 'Whether the page is the root of the origin.',
root_page: 'The URL of the root page being tested',
rank: 'Site popularity rank, from CrUX',
url: 'The URL of the request',
css: 'The parsed CSS, in JSON format'
}
}

module.exports = {
columns
}
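Note: Dataform exposes modules under includes/ as globals inside definitions/*.js files, which is why the shared descriptions object can be referenced directly and why it is added as a readonly global in the ESLint config above. A minimal sketch of the consumption pattern follows, assuming a hypothetical example_pages table; it mirrors the definitions changed in this diff rather than adding anything new.

// Sketch: consuming the shared column descriptions in a Dataform definition.
// `descriptions` is includes/descriptions.js, injected by Dataform as a global;
// 'example_pages' and the date literal are hypothetical, for illustration only.
const columns = descriptions.columns.pages

publish('example_pages', {
  type: 'table',
  schema: 'sample_data',
  columns: columns, // reuse the shared descriptions instead of inlining them
  tags: ['crawl_complete']
}).query(ctx => `
  SELECT *
  FROM ${ctx.ref('crawl', 'pages')}
  WHERE date = '2025-06-01' /* hypothetical partition date */
`)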