diff --git a/.github/linters/eslint.config.mjs b/.github/linters/eslint.config.mjs index 00e64dcd..fa0c5d48 100644 --- a/.github/linters/eslint.config.mjs +++ b/.github/linters/eslint.config.mjs @@ -22,7 +22,8 @@ export default [ ctx: 'readonly', constants: 'readonly', reports: 'readonly', - reservations: 'readonly' + reservations: 'readonly', + descriptions: 'readonly' } }, rules: { diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index 589b0068..d2cc2815 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.pages + // See https://github.com/HTTPArchive/dataform/issues/43 assert('corrupted_technology_values') .tags(['crawl_complete']) @@ -53,46 +55,7 @@ publish('pages', { clusterBy: ['client', 'is_root_page', 'rank', 'page'], requirePartitionFilter: true }, - columns: { - date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl', - client: 'Test environment: desktop or mobile', - page: 'The URL of the page being tested', - is_root_page: 'Whether the page is the root of the origin', - root_page: 'The URL of the root page being tested, the origin followed by /', - rank: 'Site popularity rank, from CrUX', - wptid: 'ID of the WebPageTest results', - payload: 'JSON-encoded WebPageTest results for the page', - summary: 'JSON-encoded summarization of the page-level data', - custom_metrics: { - description: 'Custom metrics from WebPageTest', - columns: { - a11y: 'JSON-encoded A11Y metrics', - cms: 'JSON-encoded CMS detection', - cookies: 'JSON-encoded cookie metrics', - css_variables: 'JSON-encoded CSS variable metrics', - ecommerce: 'JSON-encoded ecommerce metrics', - element_count: 'JSON-encoded element count metrics', - javascript: 'JSON-encoded JavaScript metrics', - markup: 'JSON-encoded markup metrics', - media: 'JSON-encoded media metrics', - origin_trials: 'JSON-encoded origin trial metrics', - performance: 'JSON-encoded performance metrics', 
- privacy: 'JSON-encoded privacy metrics', - responsive_images: 'JSON-encoded responsive image metrics', - robots_txt: 'JSON-encoded robots.txt metrics', - security: 'JSON-encoded security metrics', - structured_data: 'JSON-encoded structured data metrics', - third_parties: 'JSON-encoded third-party metrics', - well_known: 'JSON-encoded well-known metrics', - wpt_bodies: 'JSON-encoded WebPageTest bodies', - other: 'JSON-encoded other custom metrics' - } - }, - lighthouse: 'JSON-encoded Lighthouse report', - features: 'Blink features detected at runtime (see https://chromestatus.com/features)', - technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)', - metadata: 'Additional metadata about the test' - }, + columns: columns, tags: ['crawl_complete'], dependOnDependencyAssertions: true }).preOps(ctx => ` diff --git a/definitions/output/crawl/parsed_css.js b/definitions/output/crawl/parsed_css.js index 41f37c7b..bffaf1d8 100644 --- a/definitions/output/crawl/parsed_css.js +++ b/definitions/output/crawl/parsed_css.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.parsed_css + publish('parsed_css', { type: 'incremental', protected: true, @@ -7,16 +9,7 @@ publish('parsed_css', { clusterBy: ['client', 'is_root_page', 'rank', 'page'], requirePartitionFilter: true }, - columns: { - date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl', - client: 'Test environment: desktop or mobile', - page: 'The URL of the page being tested', - is_root_page: 'Whether the page is the root of the origin.', - root_page: 'The URL of the root page being tested', - rank: 'Site popularity rank, from CrUX', - url: 'The URL of the request', - css: 'The parsed CSS, in JSON format' - }, + columns: columns, tags: ['crawl_complete'] }).preOps(ctx => ` ${reservations.reservation_setter(ctx)} diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js index eb8d0d3b..3ece636f 100644 --- a/definitions/output/crawl/requests.js +++ 
b/definitions/output/crawl/requests.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.requests + publish('requests', { type: 'incremental', protected: true, @@ -7,35 +9,7 @@ publish('requests', { clusterBy: ['client', 'is_root_page', 'type', 'rank'], requirePartitionFilter: true }, - columns: { - date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl', - client: 'Test environment: desktop or mobile', - page: 'The URL of the page being tested', - is_root_page: 'Whether the page is the root of the origin.', - root_page: 'The URL of the root page being tested', - rank: 'Site popularity rank, from CrUX', - url: 'The URL of the request', - is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects', - type: 'Simplified description of the type of resource (script, html, css, text, other, etc)', - index: 'The sequential 0-based index of the request', - payload: 'JSON-encoded WebPageTest result data for this request', - summary: 'JSON-encoded summarization of request data', - request_headers: { - description: 'Request headers', - columns: { - name: 'Request header name', - value: 'Request header value' - } - }, - response_headers: { - description: 'Response headers', - columns: { - name: 'Response header name', - value: 'Response header value' - } - }, - response_body: 'Text-based response body' - }, + columns: columns, tags: ['crawl_complete'] }).preOps(ctx => ` ${reservations.reservation_setter(ctx)} diff --git a/definitions/output/latest/pages.js b/definitions/output/latest/pages.js new file mode 100644 index 00000000..127e6e3b --- /dev/null +++ b/definitions/output/latest/pages.js @@ -0,0 +1,29 @@ +const columns = descriptions.columns.pages + +publish('pages', { + type: 'view', + schema: 'latest', + columns: columns +}).query(ctx => ` +SELECT + * +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = ( + SELECT + PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date + FROM + 
httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS + WHERE + table_name = 'pages' AND + /* Only include actual dates in partition ids */ + partition_id >= '20250101' AND + partition_id < '20990101' AND + /* Exclude future dates - shouldn't be any, but you never know! */ + partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE()) + ) AND + /* The following should help make this even faster since above query is a little complex */ + /* We should never be more than 60 days old hopefully! */ + date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND + date <= CURRENT_DATE() +`) diff --git a/definitions/output/latest/parsed_css.js b/definitions/output/latest/parsed_css.js new file mode 100644 index 00000000..8157c9cd --- /dev/null +++ b/definitions/output/latest/parsed_css.js @@ -0,0 +1,29 @@ +const columns = descriptions.columns.parsed_css + +publish('parsed_css', { + type: 'view', + schema: 'latest', + columns: columns +}).query(ctx => ` +SELECT + * +FROM ${ctx.ref('crawl', 'parsed_css')} +WHERE + date = ( + SELECT + PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date + FROM + httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS + WHERE + table_name = 'parsed_css' AND + /* Only include actual dates in partition ids */ + partition_id >= '20250101' AND + partition_id < '20990101' AND + /* Exclude future dates - shouldn't be any, but you never know! */ + partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE()) + ) AND + /* The following should help make this even faster since above query is a little complex */ + /* We should never be more than 60 days old hopefully! 
*/ + date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND + date <= CURRENT_DATE() +`) diff --git a/definitions/output/latest/requests.js b/definitions/output/latest/requests.js new file mode 100644 index 00000000..aa783412 --- /dev/null +++ b/definitions/output/latest/requests.js @@ -0,0 +1,30 @@ +const columns = descriptions.columns.requests + +publish('requests', { + type: 'view', + schema: 'latest', + columns: columns +}).query(ctx => ` +SELECT + * +FROM + ${ctx.ref('crawl', 'requests')} +WHERE + date = ( + SELECT + PARSE_DATE('%Y%m%d', MAX(partition_id)) AS date + FROM + httparchive.crawl.INFORMATION_SCHEMA.PARTITIONS + WHERE + table_name = 'requests' AND + /* Only include actual dates in partition ids */ + partition_id >= '20250101' AND + partition_id < '20990101' AND + /* Exclude future dates - shouldn't be any, but you never know! */ + partition_id <= FORMAT_DATE('%Y%m%d', CURRENT_DATE()) + ) AND + /* The following should help make this even faster since above query is a little complex */ + /* We should never be more than 60 days old hopefully! 
*/ + date >= DATE_SUB(CURRENT_DATE(), INTERVAL 61 DAY) AND + date <= CURRENT_DATE() +`) diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js index 0d8b67e6..ffb65ab8 100644 --- a/definitions/output/sample_data/pages_10k.js +++ b/definitions/output/sample_data/pages_10k.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.pages + publish('pages_10k', { type: 'table', schema: 'sample_data', @@ -5,6 +7,7 @@ publish('pages_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, + columns: columns, tags: ['crawl_complete'] }).query(ctx => ` SELECT * diff --git a/definitions/output/sample_data/parsed_css_10k.js b/definitions/output/sample_data/parsed_css_10k.js index 1e9c7f47..4fe570a9 100644 --- a/definitions/output/sample_data/parsed_css_10k.js +++ b/definitions/output/sample_data/parsed_css_10k.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.parsed_css + publish('parsed_css_10k', { type: 'table', schema: 'sample_data', @@ -5,6 +7,7 @@ publish('parsed_css_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, + columns: columns, tags: ['crawl_complete'] }).query(ctx => ` SELECT * diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js index 0fcf1192..444f0386 100644 --- a/definitions/output/sample_data/requests_10k.js +++ b/definitions/output/sample_data/requests_10k.js @@ -1,3 +1,5 @@ +const columns = descriptions.columns.requests + publish('requests_10k', { type: 'table', schema: 'sample_data', @@ -5,6 +7,7 @@ publish('requests_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank', 'type'] }, + columns: columns, tags: ['crawl_complete'] }).query(ctx => ` SELECT * diff --git a/includes/descriptions.js b/includes/descriptions.js new file mode 100644 index 00000000..4c8d0df8 --- /dev/null +++ b/includes/descriptions.js @@ -0,0 +1,86 @@ + +const columns = { + pages: { + date: 
'YYYY-MM-DD format of the HTTP Archive monthly crawl', + client: 'Test environment: desktop or mobile', + page: 'The URL of the page being tested', + is_root_page: 'Whether the page is the root of the origin', + root_page: 'The URL of the root page being tested, the origin followed by /', + rank: 'Site popularity rank, from CrUX', + wptid: 'ID of the WebPageTest results', + payload: 'JSON-encoded WebPageTest results for the page', + summary: 'JSON-encoded summarization of the page-level data', + custom_metrics: { + description: 'Custom metrics from WebPageTest', + columns: { + a11y: 'JSON-encoded A11Y metrics', + cms: 'JSON-encoded CMS detection', + cookies: 'JSON-encoded cookie metrics', + css_variables: 'JSON-encoded CSS variable metrics', + ecommerce: 'JSON-encoded ecommerce metrics', + element_count: 'JSON-encoded element count metrics', + javascript: 'JSON-encoded JavaScript metrics', + markup: 'JSON-encoded markup metrics', + media: 'JSON-encoded media metrics', + origin_trials: 'JSON-encoded origin trial metrics', + performance: 'JSON-encoded performance metrics', + privacy: 'JSON-encoded privacy metrics', + responsive_images: 'JSON-encoded responsive image metrics', + robots_txt: 'JSON-encoded robots.txt metrics', + security: 'JSON-encoded security metrics', + structured_data: 'JSON-encoded structured data metrics', + third_parties: 'JSON-encoded third-party metrics', + well_known: 'JSON-encoded well-known metrics', + wpt_bodies: 'JSON-encoded WebPageTest bodies', + other: 'JSON-encoded other custom metrics' + } + }, + lighthouse: 'JSON-encoded Lighthouse report', + features: 'Blink features detected at runtime (see https://chromestatus.com/features)', + technologies: 'Technologies detected at runtime (see https://www.wappalyzer.com/)', + metadata: 'Additional metadata about the test' + }, + requests: { + date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl', + client: 'Test environment: desktop or mobile', + page: 'The URL of the page being tested', 
+ is_root_page: 'Whether the page is the root of the origin.', + root_page: 'The URL of the root page being tested', + rank: 'Site popularity rank, from CrUX', + url: 'The URL of the request', + is_main_document: 'Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects', + type: 'Simplified description of the type of resource (script, html, css, text, other, etc)', + index: 'The sequential 0-based index of the request', + payload: 'JSON-encoded WebPageTest result data for this request', + summary: 'JSON-encoded summarization of request data', + request_headers: { + description: 'Request headers', + columns: { + name: 'Request header name', + value: 'Request header value' + } + }, + response_headers: { + description: 'Response headers', + columns: { + name: 'Response header name', + value: 'Response header value' + } + }, + response_body: 'Text-based response body' + }, + parsed_css: { + date: 'YYYY-MM-DD format of the HTTP Archive monthly crawl', + client: 'Test environment: desktop or mobile', + page: 'The URL of the page being tested', + is_root_page: 'Whether the page is the root of the origin.', + root_page: 'The URL of the root page being tested', + rank: 'Site popularity rank, from CrUX', + url: 'The URL of the request', + css: 'The parsed CSS, in JSON format' + } +} + +module.exports = { + columns +}