From 43d61cc499eafd931c247d7890e91fa6964320c6 Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Sun, 21 Dec 2025 19:10:56 +0100
Subject: [PATCH 1/3] Refactor output definitions to use centralized column
 descriptions and create new latest and sample_data files

---
 definitions/output/crawl/pages.js           | 43 +---------
 definitions/output/crawl/parsed_css.js      | 13 +--
 definitions/output/crawl/requests.js        | 32 +------
 definitions/output/latest/pages.js          | 29 +++++++
 definitions/output/latest/parsed_css.js     | 29 +++++++
 definitions/output/latest/requests.js       | 30 +++++++
 definitions/output/sample_data/pages_10k.js |  3 +
 .../output/sample_data/parsed_css_10k.js    |  3 +
 .../output/sample_data/requests_10k.js      |  3 +
 includes/shared/descriptions.js             | 86 +++++++++++++++++++
 10 files changed, 192 insertions(+), 79 deletions(-)
 create mode 100644 definitions/output/latest/pages.js
 create mode 100644 definitions/output/latest/parsed_css.js
 create mode 100644 definitions/output/latest/requests.js
 create mode 100644 includes/shared/descriptions.js

diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js
index 589b0068..510d863c 100644
--- a/definitions/output/crawl/pages.js
+++ b/definitions/output/crawl/pages.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.pages;
+
 // See https://github.com/HTTPArchive/dataform/issues/43
 assert('corrupted_technology_values')
   .tags(['crawl_complete'])
@@ -53,46 +55,7 @@ publish('pages', {
     clusterBy: ['client', 'is_root_page', 'rank', 'page'],
     requirePartitionFilter: true
   },
-  columns: {
-    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
-    client: 'Test environment: desktop or mobile',
-    page: 'The URL of the page being tested',
-    is_root_page: 'Whether the page is the root of the origin',
-    root_page: 'The URL of the root page being tested, the origin followed by /',
-    rank: 'Site popularity rank, from CrUX',
-    wptid: 'ID of the WebPageTest results',
-    payload: 'JSON-encoded WebPageTest results for the page',
-    summary: 'JSON-encoded summarization of the page-level data',
-    custom_metrics: {
-      description: 'Custom metrics from WebPageTest',
-      columns: {
-        a11y: 'JSON-encoded A11Y metrics',
-        cms: 'JSON-encoded CMS detection',
-        cookies: 'JSON-encoded cookie metrics',
-        css_variables: 'JSON-encoded CSS variable metrics',
-        ecommerce: 'JSON-encoded ecommerce metrics',
-        element_count: 'JSON-encoded element count metrics',
-        javascript: 'JSON-encoded JavaScript metrics',
-        markup: 'JSON-encoded markup metrics',
-        media: 'JSON-encoded media metrics',
-        origin_trials: 'JSON-encoded origin trial metrics',
-        performance: 'JSON-encoded performance metrics',
-        privacy: 'JSON-encoded privacy metrics',
-        responsive_images: 'JSON-encoded responsive image metrics',
-        robots_txt: 'JSON-encoded robots.txt metrics',
-        security: 'JSON-encoded security metrics',
-        structured_data: 'JSON-encoded structured data metrics',
-        third_parties: 'JSON-encoded third-party metrics',
-        well_known: 'JSON-encoded well-known metrics',
-        wpt_bodies: 'JSON-encoded WebPageTest bodies',
-        other: 'JSON-encoded other custom metrics'
-      }
-    },
-    lighthouse: 'JSON-encoded Lighthouse report',
-    features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
-    technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
-    metadata: 'Additional metadata about the test'
-  },
+  columns: columns,
   tags: ['crawl_complete'],
   dependOnDependencyAssertions: true
 }).preOps(ctx => `
diff --git a/definitions/output/crawl/parsed_css.js b/definitions/output/crawl/parsed_css.js
index 41f37c7b..3274af5c 100644
--- a/definitions/output/crawl/parsed_css.js
+++ b/definitions/output/crawl/parsed_css.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.parsed_css;
+
 publish('parsed_css', {
   type: 'incremental',
   protected: true,
@@ -7,16 +9,7 @@ publish('parsed_css', {
     clusterBy: ['client', 'is_root_page', 'rank', 'page'],
     requirePartitionFilter: true
   },
-  columns: {
-    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
-    client: 'Test environment: desktop or mobile',
-    page: 'The URL of the page being tested',
-    is_root_page: 'Whether the page is the root of the origin.',
-    root_page: 'The URL of the root page being tested',
-    rank: 'Site popularity rank, from CrUX',
-    url: 'The URL of the request',
-    css: 'The parsed CSS, in JSON format'
-  },
+  columns: columns,
   tags: ['crawl_complete']
 }).preOps(ctx => `
 ${reservations.reservation_setter(ctx)}
diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js
index eb8d0d3b..ffb0211e 100644
--- a/definitions/output/crawl/requests.js
+++ b/definitions/output/crawl/requests.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.requests;
+
 publish('requests', {
   type: 'incremental',
   protected: true,
@@ -7,35 +9,7 @@ publish('requests', {
     clusterBy: ['client', 'is_root_page', 'type', 'rank'],
     requirePartitionFilter: true
   },
-  columns: {
-    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
-    client: 'Test environment: desktop or mobile',
-    page: 'The URL of the page being tested',
-    is_root_page: 'Whether the page is the root of the origin.',
-    root_page: 'The URL of the root page being tested',
-    rank: 'Site popularity rank, from CrUX',
-    url: 'The URL of the request',
-    is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
-    type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
-    index: 'The sequential 0-based index of the request',
-    payload: 'JSON-encoded WebPageTest result data for this request',
-    summary: 'JSON-encoded summarization of request data',
-    request_headers: {
-      description: 'Request headers',
-      columns: {
-        name: 'Request header name',
-        value: 'Request header value'
-      }
-    },
-    response_headers: {
-      description: 'Response headers',
-      columns: {
-        name: 'Response header name',
-        value: 'Response header value'
-      }
-    },
-    response_body: 'Text-based response body'
-  },
+  columns: columns,
   tags: ['crawl_complete']
 }).preOps(ctx => `
 ${reservations.reservation_setter(ctx)}
diff --git a/definitions/output/latest/pages.js b/definitions/output/latest/pages.js
new file mode 100644
index 00000000..417e5ce1
--- /dev/null
+++ b/definitions/output/latest/pages.js
@@ -0,0 +1,29 @@
+const columns = descriptions.columns.pages;
+
+publish('pages', {
+  type: 'view',
+  schema: 'latest',
+  columns: columns
+}).query(ctx => `
+SELECT
+  *
+FROM ${ctx.ref('crawl', 'pages')}
+WHERE
+  date = (
+    SELECT
+      PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
+    FROM
+      httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
+    WHERE
+      table_name = 'pages' AND
+      /* Only include actual dates in partition ids */
+      partition_id >= '20250101' AND
+      partition_id < '20990101' AND
+      /* Exclude future dates - shouldn't be any, but you never know! */
+      partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
+  ) AND
+  /* The following should help make this even faster since above query is a little complex */
+  /* We should never be more than 60 days old hopefully! */
+  date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
+  date <= CURRENT_DATE()
+`);
diff --git a/definitions/output/latest/parsed_css.js b/definitions/output/latest/parsed_css.js
new file mode 100644
index 00000000..02f4d4ca
--- /dev/null
+++ b/definitions/output/latest/parsed_css.js
@@ -0,0 +1,29 @@
+const columns = descriptions.columns.parsed_css;
+
+publish('parsed_css', {
+  type: 'view',
+  schema: 'latest',
+  columns: columns,
+}).query(ctx => `
+SELECT
+  *
+FROM ${ctx.ref('crawl', 'parsed_css')}
+WHERE
+  date = (
+    SELECT
+      PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
+    FROM
+      httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
+    WHERE
+      table_name = 'parsed_css' AND
+      /* Only include actual dates in partition ids */
+      partition_id >= '20250101' AND
+      partition_id < '20990101' AND
+      /* Exclude future dates - shouldn't be any, but you never know! */
+      partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
+  ) AND
+  /* The following should help make this even faster since above query is a little complex */
+  /* We should never be more than 60 days old hopefully! */
+  date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
+  date <= CURRENT_DATE()
+`)
diff --git a/definitions/output/latest/requests.js b/definitions/output/latest/requests.js
new file mode 100644
index 00000000..37aafc1d
--- /dev/null
+++ b/definitions/output/latest/requests.js
@@ -0,0 +1,30 @@
+const columns = descriptions.columns.requests;
+
+publish('requests', {
+  type: 'view',
+  schema: 'latest',
+  columns: columns,
+}).query(ctx => `
+SELECT
+  *
+FROM
+  ${ctx.ref('crawl', 'requests')}
+WHERE
+  date = (
+    SELECT
+      PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date
+    FROM
+      httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS
+    WHERE
+      table_name = 'requests' AND
+      /* Only include actual dates in partition ids */
+      partition_id >= '20250101' AND
+      partition_id < '20990101' AND
+      /* Exclude future dates - shouldn't be any, but you never know! */
+      partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
+  ) AND
+  /* The following should help make this even faster since above query is a little complex */
+  /* We should never be more than 60 days old hopefully! */
+  date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
+  date <= CURRENT_DATE()
+`)
diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js
index 0d8b67e6..1e5e56e4 100644
--- a/definitions/output/sample_data/pages_10k.js
+++ b/definitions/output/sample_data/pages_10k.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.pages;
+
 publish('pages_10k', {
   type: 'table',
   schema: 'sample_data',
@@ -5,6 +7,7 @@ publish('pages_10k', {
     partitionBy: 'date',
     clusterBy: ['client', 'is_root_page', 'rank', 'page']
   },
+  columns: columns,
   tags: ['crawl_complete']
 }).query(ctx => `
 SELECT *
diff --git a/definitions/output/sample_data/parsed_css_10k.js b/definitions/output/sample_data/parsed_css_10k.js
index 1e9c7f47..05fa0bcb 100644
--- a/definitions/output/sample_data/parsed_css_10k.js
+++ b/definitions/output/sample_data/parsed_css_10k.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.parsed_css;
+
 publish('parsed_css_10k', {
   type: 'table',
   schema: 'sample_data',
@@ -5,6 +7,7 @@ publish('parsed_css_10k', {
     partitionBy: 'date',
     clusterBy: ['client', 'is_root_page', 'rank', 'page']
   },
+  columns: columns,
   tags: ['crawl_complete']
 }).query(ctx => `
 SELECT *
diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js
index 0fcf1192..8ef7b290 100644
--- a/definitions/output/sample_data/requests_10k.js
+++ b/definitions/output/sample_data/requests_10k.js
@@ -1,3 +1,5 @@
+const columns = descriptions.columns.requests;
+
 publish('requests_10k', {
   type: 'table',
   schema: 'sample_data',
@@ -5,6 +7,7 @@ publish('requests_10k', {
     partitionBy: 'date',
     clusterBy: ['client', 'is_root_page', 'rank', 'type']
   },
+  columns: columns,
   tags: ['crawl_complete']
 }).query(ctx => `
 SELECT *
diff --git a/includes/shared/descriptions.js b/includes/shared/descriptions.js
new file mode 100644
index 00000000..963fceaa
--- /dev/null
+++ b/includes/shared/descriptions.js
@@ -0,0 +1,86 @@
+
+const columns = {
+  pages: {
+    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
+    client: 'Test environment: desktop or mobile',
+    page: 'The URL of the page being tested',
+    is_root_page: 'Whether the page is the root of the origin',
+    root_page: 'The URL of the root page being tested, the origin followed by /',
+    rank: 'Site popularity rank, from CrUX',
+    wptid: 'ID of the WebPageTest results',
+    payload: 'JSON-encoded WebPageTest results for the page',
+    summary: 'JSON-encoded summarization of the page-level data',
+    custom_metrics: {
+      description: 'Custom metrics from WebPageTest',
+      columns: {
+        a11y: 'JSON-encoded A11Y metrics',
+        cms: 'JSON-encoded CMS detection',
+        cookies: 'JSON-encoded cookie metrics',
+        css_variables: 'JSON-encoded CSS variable metrics',
+        ecommerce: 'JSON-encoded ecommerce metrics',
+        element_count: 'JSON-encoded element count metrics',
+        javascript: 'JSON-encoded JavaScript metrics',
+        markup: 'JSON-encoded markup metrics',
+        media: 'JSON-encoded media metrics',
+        origin_trials: 'JSON-encoded origin trial metrics',
+        performance: 'JSON-encoded performance metrics',
+        privacy: 'JSON-encoded privacy metrics',
+        responsive_images: 'JSON-encoded responsive image metrics',
+        robots_txt: 'JSON-encoded robots.txt metrics',
+        security: 'JSON-encoded security metrics',
+        structured_data: 'JSON-encoded structured data metrics',
+        third_parties: 'JSON-encoded third-party metrics',
+        well_known: 'JSON-encoded well-known metrics',
+        wpt_bodies: 'JSON-encoded WebPageTest bodies',
+        other: 'JSON-encoded other custom metrics'
+      }
+    },
+    lighthouse: 'JSON-encoded Lighthouse report',
+    features: 'Blink features detected at runtime (see https://chromestatus.com/features)',
+    technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)',
+    metadata: 'Additional metadata about the test'
+  },
+  requests: {
+    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
+    client: 'Test environment: desktop or mobile',
+    page: 'The URL of the page being tested',
+    is_root_page: 'Whether the page is the root of the origin.',
+    root_page: 'The URL of the root page being tested',
+    rank: 'Site popularity rank, from CrUX',
+    url: 'The URL of the request',
+    is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects',
+    type: 'Simplified description of the type of resource (script, html, css, text, other, etc)',
+    index: 'The sequential 0-based index of the request',
+    payload: 'JSON-encoded WebPageTest result data for this request',
+    summary: 'JSON-encoded summarization of request data',
+    request_headers: {
+      description: 'Request headers',
+      columns: {
+        name: 'Request header name',
+        value: 'Request header value'
+      }
+    },
+    response_headers: {
+      description: 'Response headers',
+      columns: {
+        name: 'Response header name',
+        value: 'Response header value'
+      }
+    },
+    response_body: 'Text-based response body'
+  },
+  parsed_css: {
+    date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl',
+    client: 'Test environment: desktop or mobile',
+    page: 'The URL of the page being tested',
+    is_root_page: 'Whether the page is the root of the origin.',
+    root_page: 'The URL of the root page being tested',
+    rank: 'Site popularity rank, from CrUX',
+    url: 'The URL of the request',
+    css: 'The parsed CSS, in JSON format'
+  }
+}
+
+module.exports = {
+  columns
+};

From f8141c63fdd93f113363ba8a6bdc0e0f69265c50 Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Sun, 21 Dec 2025 19:17:21 +0100
Subject: [PATCH 2/3] Move the column descriptions module from includes/shared
 to includes

---
 includes/{shared => }/descriptions.js | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename includes/{shared => }/descriptions.js (100%)

diff --git a/includes/shared/descriptions.js b/includes/descriptions.js
similarity index 100%
rename from includes/shared/descriptions.js
rename to includes/descriptions.js

From 639d0a3d2d5247427e8140d9f8cfe322127136f6 Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Sun, 21 Dec 2025 19:30:08 +0100
Subject: [PATCH 3/3] lint

---
 .github/linters/eslint.config.mjs                | 3 ++-
 definitions/output/crawl/pages.js                | 2 +-
 definitions/output/crawl/parsed_css.js           | 2 +-
 definitions/output/crawl/requests.js             | 2 +-
 definitions/output/latest/pages.js               | 4 ++--
 definitions/output/latest/parsed_css.js          | 2 +-
 definitions/output/latest/requests.js            | 2 +-
 definitions/output/sample_data/pages_10k.js      | 2 +-
 definitions/output/sample_data/parsed_css_10k.js | 2 +-
 definitions/output/sample_data/requests_10k.js   | 2 +-
 includes/descriptions.js                         | 2 +-
 11 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/.github/linters/eslint.config.mjs b/.github/linters/eslint.config.mjs
index 00e64dcd..fa0c5d48 100644
--- a/.github/linters/eslint.config.mjs
+++ b/.github/linters/eslint.config.mjs
@@ -22,7 +22,8 @@ export default [
         ctx: 'readonly',
         constants: 'readonly',
         reports: 'readonly',
-        reservations: 'readonly'
+        reservations: 'readonly',
+        descriptions: 'readonly'
       }
     },
     rules: {
diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js
index 510d863c..d2cc2815 100644
--- a/definitions/output/crawl/pages.js
+++ b/definitions/output/crawl/pages.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.pages;
+const columns = descriptions.columns.pages
 
 // See https://github.com/HTTPArchive/dataform/issues/43
 assert('corrupted_technology_values')
diff --git a/definitions/output/crawl/parsed_css.js b/definitions/output/crawl/parsed_css.js
index 3274af5c..bffaf1d8 100644
--- a/definitions/output/crawl/parsed_css.js
+++ b/definitions/output/crawl/parsed_css.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.parsed_css;
+const columns = descriptions.columns.parsed_css
 
 publish('parsed_css', {
   type: 'incremental',
diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js
index ffb0211e..3ece636f 100644
--- a/definitions/output/crawl/requests.js
+++ b/definitions/output/crawl/requests.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.requests;
+const columns = descriptions.columns.requests
 
 publish('requests', {
   type: 'incremental',
diff --git a/definitions/output/latest/pages.js b/definitions/output/latest/pages.js
index 417e5ce1..127e6e3b 100644
--- a/definitions/output/latest/pages.js
+++ b/definitions/output/latest/pages.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.pages;
+const columns = descriptions.columns.pages
 
 publish('pages', {
   type: 'view',
@@ -26,4 +26,4 @@
   /* We should never be more than 60 days old hopefully! */
   date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND
   date <= CURRENT_DATE()
-`);
+`)
diff --git a/definitions/output/latest/parsed_css.js b/definitions/output/latest/parsed_css.js
index 02f4d4ca..8157c9cd 100644
--- a/definitions/output/latest/parsed_css.js
+++ b/definitions/output/latest/parsed_css.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.parsed_css;
+const columns = descriptions.columns.parsed_css
 
 publish('parsed_css', {
   type: 'view',
diff --git a/definitions/output/latest/requests.js b/definitions/output/latest/requests.js
index 37aafc1d..aa783412 100644
--- a/definitions/output/latest/requests.js
+++ b/definitions/output/latest/requests.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.requests;
+const columns = descriptions.columns.requests
 
 publish('requests', {
   type: 'view',
diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js
index 1e5e56e4..ffb65ab8 100644
--- a/definitions/output/sample_data/pages_10k.js
+++ b/definitions/output/sample_data/pages_10k.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.pages;
+const columns = descriptions.columns.pages
 
 publish('pages_10k', {
   type: 'table',
diff --git a/definitions/output/sample_data/parsed_css_10k.js b/definitions/output/sample_data/parsed_css_10k.js
index 05fa0bcb..4fe570a9 100644
--- a/definitions/output/sample_data/parsed_css_10k.js
+++ b/definitions/output/sample_data/parsed_css_10k.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.parsed_css;
+const columns = descriptions.columns.parsed_css
 
 publish('parsed_css_10k', {
   type: 'table',
diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js
index 8ef7b290..444f0386 100644
--- a/definitions/output/sample_data/requests_10k.js
+++ b/definitions/output/sample_data/requests_10k.js
@@ -1,4 +1,4 @@
-const columns = descriptions.columns.requests;
+const columns = descriptions.columns.requests
 
 publish('requests_10k', {
   type: 'table',
diff --git a/includes/descriptions.js b/includes/descriptions.js
index 963fceaa..4c8d0df8 100644
--- a/includes/descriptions.js
+++ b/includes/descriptions.js
@@ -83,4 +83,4 @@ const columns = {
 
 module.exports = {
   columns
-};
+}