Media 2021 queries (#2583)

* Media 2021 queries (#2144) * Test query for 'encoding -> format use' * Percent of pages with picture elements, and distribution of number of picture elements per page * Simplify query, based on advice from Kevin Farrugia * Basic script for counting no of images that use lazy loading * Replaced JSON_EXTRACT_SCALAR with JSON_VALUE as the former is deprecated * Adding the query to pull in details about the usage of , , and responsive dimension specification based on new custom metrics * Copying and updating last year's media queries * Adding a file from Performance chapter and tweaking it to report image dimension by industry vertical * Updated the alt query to also count images that have attribute * Removing reference to "decode=lazy" from alt tag SQL; creating a new query for decode usage * removing a condition that is not valid for this query * correcting name of an output query field to indicate % * Add extra totals columns, query July data * Fix for loop; 'lazy' → 'async' * Add average image query * Deal with nulls and INT64 limits * Adding a query to find the usage of image cdns * Adding a SQL to count cross domain image requests * Bits per pixel, by format * A few fixes to BPP/format query * Align on distribution percentiles * Copying Colin's query for top media queries from 2019 SQL code base * Changed the description of the query * Corrected the looping construct to make it more readable * Two bytes and dimensions queries to rule them all * Img elements with one and zero pixel resources ...excluded from other analysis * Top aspect ratios query * Remove unnecessary queries * Portrait / aspect ratio / square query * Comment out smallImageCount and bigImageCount constraint ...which was making it return no results? * Most common sizes values * Sizes implicit vs explicit, and parse errors * .sql * linting * lint * lint * lint * remaining queries * lint * 1x1 and 0x0 Co-authored-by: Eric Portis <[email protected]> Co-authored-by: Akshay Ranganath <[email protected]>
HTTPArchive · Nov 24, 2021 · 5de0fe6 · 5de0fe6
1 parent 58c269d
commit 5de0fe6
Show file tree

Hide file tree

Showing 40 changed files with 1,716 additions and 32 deletions.
diff --git a/sql/2021/media/README.md b/sql/2021/media/README.md
@@ -8,3 +8,4 @@
 
   Analysts: if helpful, you can use this README to give additional info about the queries.
 -->
+
diff --git a/sql/2021/media/bytes_and_dimensions.sql b/sql/2021/media/bytes_and_dimensions.sql
@@ -0,0 +1,170 @@
+-- getSrcsetInfo: turns a JSON array of per-image objects into one STRUCT per
+-- image: URL, approximate resource width/height, byte size, bits per pixel,
+-- a 1x1 "pixel" flag, a data: URL flag and a normalised format name.
+-- Missing numeric fields default to 0. Returns an empty array when the input
+-- is NULL, is not valid JSON, or is not a JSON array.
+CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
+RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
+LANGUAGE js AS '''
+
+// Canonical short name for every MIME subtype / file extension we recognise.
+const subtypeMap = {
+  'svg+xml': 'svg',
+  'svgz': 'svg',
+  'jpeg': 'jpg',
+  'jfif': 'jpg',
+  'x-png': 'png',
+  'vnd.microsoft.icon': 'ico',
+  'x-icon': 'ico',
+  'jxr': 'jxr',
+  'vnd.ms-photo': 'jxr',
+  'hdp': 'jxr',
+  'wdp': 'jxr',
+  'jpf': 'jp2',
+  'jpx': 'jp2',
+  'jpm': 'jp2',
+  'mj2': 'jp2',
+  'x-jp2-container': 'jp2',
+  'x-jp2-codestream': 'jp2',
+  'x-jpeg2000-image': 'jp2',
+  'heic': 'heif',
+  'x-ms-bmp': 'bmp',
+  'x-pict': 'pict',
+  'tif': 'tiff',
+  'x-tif': 'tiff',
+  'x-tiff': 'tiff',
+  'vnd.mozilla.apng': 'apng',
+  // identities
+  'apng': 'apng',
+  'jpg': 'jpg',
+  'jp2': 'jp2',
+  'png': 'png',
+  'gif': 'gif',
+  'ico': 'ico',
+  'webp': 'webp',
+  'avif': 'avif',
+  'tiff': 'tiff',
+  'flif': 'flif',
+  'heif': 'heif',
+  'jxl': 'jxl',
+  'avif-sequence': 'avif-sequence', // keep separate from single frames...
+  'heic-sequence': 'heic-sequence',
+  'bmp': 'bmp',
+  'pict': 'pict'
+};
+
+// Returns the canonical format for a subtype or extension, or 'unknown'.
+// hasOwnProperty keeps inherited Object.prototype keys (e.g. "constructor")
+// from being mistaken for a known format.
+function normalizeSubtype( subtype ) {
+  if ( Object.prototype.hasOwnProperty.call( subtypeMap, subtype ) ) {
+    return subtypeMap[ subtype ];
+  }
+  return 'unknown'; // switch between:
+                    // `subtype`
+                    //     to see everything, check if there's anything else worth capturing
+                    // `'unknown'`
+                    //     to make results manageable
+}
+
+// Returns the lower-cased text after the last "." of the last "/"-separated
+// segment, or null when there is no such segment or no dot in it.
+function extensionOf( urlLike ) {
+  const segments = urlLike.split( "/" );
+  if ( segments.length < 2 ) {
+    return null;
+  }
+  const pieces = segments[ segments.length - 1 ].split( "." );
+  if ( pieces.length < 2 ) {
+    return null;
+  }
+  return pieces[ pieces.length - 1 ]
+    .toLowerCase()
+    .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ); // strip query params
+}
+
+// Picks a format name from, in order of preference: the data: URL's MIME
+// type, the content-type, then the file extension in the URL.
+function pithyType( { contentType, url } ) {
+  const urlIsString = Boolean( url ) && typeof url === "string";
+
+  // if it's a data url, take the mime type from there, done.
+  if ( urlIsString ) {
+    const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
+    if ( match && match[ 1 ] ) {
+      return normalizeSubtype( match[ 1 ] );
+    }
+  }
+
+  // if we get a content-type header, use it!
+  if ( contentType &&
+       typeof contentType === "string" ) {
+    const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
+    if ( match && match[ 1 ] ) {
+      return normalizeSubtype( match[ 1 ] );
+    }
+  }
+
+  // otherwise fall back to the extension in the URL. Look at the URL with its
+  // query string / fragment removed first, so dots or slashes inside the query
+  // (e.g. "a.jpg?v=1.2") cannot hide the real extension; then retry on the
+  // full URL to keep matching URLs such as "image?file=a.jpg".
+  if ( urlIsString ) {
+    const candidates = [ url.replace( /[?#].*$/, '' ), url ];
+    for ( const candidate of candidates ) {
+      const extension = extensionOf( candidate );
+      if ( extension ) {
+        const format = normalizeSubtype( extension );
+        if ( format !== 'unknown' ) {
+          return format;
+        }
+      }
+    }
+  }
+
+  // otherwise throw up our hands
+  return 'unknown';
+}
+
+let parsed;
+try {
+  parsed = JSON.parse( responsiveImagesJsonString );
+} catch ( e ) {
+  return [];
+}
+// Only a JSON array can be mapped; anything else yields no rows.
+if ( !Array.isArray( parsed ) ) {
+  return [];
+}
+
+const dataRegEx = new RegExp('^data');
+return parsed
+  .filter( d => d && typeof d === "object" ) // skip null / scalar entries
+  .map( d => ({
+    imgURL: d.url,
+    approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
+    approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
+    byteSize: Math.floor( d.byteSize || 0 ),
+    bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
+    isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
+    isDataURL: dataRegEx.test(d.url),
+    resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
+  }) );
+''';
+
+-- Distribution (p0/10/25/50/75/90/100) of image resource width, height,
+-- aspect ratio, megapixels, byte size and bits per pixel, per client.
+-- Images whose approximate width or height is <= 1 are left out.
+WITH imgs AS (
+  -- One row per image reported by getSrcsetInfo for each page.
+  SELECT
+    _TABLE_SUFFIX AS client,
+    url AS pageURL,
+    imgURL,
+    approximateResourceWidth,
+    approximateResourceHeight,
+    byteSize,
+    bitsPerPixel,
+    isPixel,
+    isDataURL,
+    ( approximateResourceWidth * approximateResourceHeight ) / 1000000 AS megapixels,
+    -- getSrcsetInfo defaults a missing height to 0, and this CTE is evaluated
+    -- before the "> 1" filter below, so plain "/" could fail on division by
+    -- zero. SAFE_DIVIDE yields NULL for those rows instead.
+    SAFE_DIVIDE(approximateResourceWidth, approximateResourceHeight) AS aspectRatio,
+    resourceFormat
+  FROM
+    `httparchive.pages.2021_07_01_*`,
+    UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images' ), '$.responsive-images')))
+),
+
+percentiles AS (
+  -- 1001-element quantile arrays: element i is the (i / 10)th percentile.
+  SELECT
+    client,
+    APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
+    APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
+    APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
+    APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
+    APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
+    APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
+    COUNT(0) AS imgCount
+  FROM
+    imgs
+  WHERE
+    approximateResourceWidth > 1 AND
+    approximateResourceHeight > 1
+  GROUP BY
+    client
+)
+
+SELECT
+  percentile,
+  client,
+  imgCount,
+  -- percentile * 10 converts a percentile into an index of the 1000-quantile array
+  resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
+  resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
+  aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
+  megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
+  byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
+  bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
+FROM
+  percentiles,
+  UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
+ORDER BY
+  imgCount DESC,
+  client,
+  percentile
diff --git a/sql/2021/media/bytes_and_dimensions_by_format.sql b/sql/2021/media/bytes_and_dimensions_by_format.sql
@@ -0,0 +1,173 @@
+-- getSrcsetInfo: turns a JSON array of per-image objects into one STRUCT per
+-- image: URL, approximate resource width/height, byte size, bits per pixel,
+-- a 1x1 "pixel" flag, a data: URL flag and a normalised format name.
+-- Missing numeric fields default to 0. Returns an empty array when the input
+-- is NULL, is not valid JSON, or is not a JSON array.
+CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
+RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
+LANGUAGE js AS '''
+
+// Canonical short name for every MIME subtype / file extension we recognise.
+const subtypeMap = {
+  'svg+xml': 'svg',
+  'svgz': 'svg',
+  'jpeg': 'jpg',
+  'jfif': 'jpg',
+  'x-png': 'png',
+  'vnd.microsoft.icon': 'ico',
+  'x-icon': 'ico',
+  'jxr': 'jxr',
+  'vnd.ms-photo': 'jxr',
+  'hdp': 'jxr',
+  'wdp': 'jxr',
+  'jpf': 'jp2',
+  'jpx': 'jp2',
+  'jpm': 'jp2',
+  'mj2': 'jp2',
+  'x-jp2-container': 'jp2',
+  'x-jp2-codestream': 'jp2',
+  'x-jpeg2000-image': 'jp2',
+  'heic': 'heif',
+  'x-ms-bmp': 'bmp',
+  'x-pict': 'pict',
+  'tif': 'tiff',
+  'x-tif': 'tiff',
+  'x-tiff': 'tiff',
+  'vnd.mozilla.apng': 'apng',
+  // identities
+  'apng': 'apng',
+  'jpg': 'jpg',
+  'jp2': 'jp2',
+  'png': 'png',
+  'gif': 'gif',
+  'ico': 'ico',
+  'webp': 'webp',
+  'avif': 'avif',
+  'tiff': 'tiff',
+  'flif': 'flif',
+  'heif': 'heif',
+  'jxl': 'jxl',
+  'avif-sequence': 'avif-sequence', // keep separate from single frames...
+  'heic-sequence': 'heic-sequence',
+  'bmp': 'bmp',
+  'pict': 'pict'
+};
+
+// Returns the canonical format for a subtype or extension, or 'unknown'.
+// hasOwnProperty keeps inherited Object.prototype keys (e.g. "constructor")
+// from being mistaken for a known format.
+function normalizeSubtype( subtype ) {
+  if ( Object.prototype.hasOwnProperty.call( subtypeMap, subtype ) ) {
+    return subtypeMap[ subtype ];
+  }
+  return 'unknown'; // switch between:
+                    // `subtype`
+                    //     to see everything, check if there's anything else worth capturing
+                    // `'unknown'`
+                    //     to make results manageable
+}
+
+// Returns the lower-cased text after the last "." of the last "/"-separated
+// segment, or null when there is no such segment or no dot in it.
+function extensionOf( urlLike ) {
+  const segments = urlLike.split( "/" );
+  if ( segments.length < 2 ) {
+    return null;
+  }
+  const pieces = segments[ segments.length - 1 ].split( "." );
+  if ( pieces.length < 2 ) {
+    return null;
+  }
+  return pieces[ pieces.length - 1 ]
+    .toLowerCase()
+    .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ); // strip query params
+}
+
+// Picks a format name from, in order of preference: the data: URL's MIME
+// type, the content-type, then the file extension in the URL.
+function pithyType( { contentType, url } ) {
+  const urlIsString = Boolean( url ) && typeof url === "string";
+
+  // if it's a data url, take the mime type from there, done.
+  if ( urlIsString ) {
+    const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
+    if ( match && match[ 1 ] ) {
+      return normalizeSubtype( match[ 1 ] );
+    }
+  }
+
+  // if we get a content-type header, use it!
+  if ( contentType &&
+       typeof contentType === "string" ) {
+    const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
+    if ( match && match[ 1 ] ) {
+      return normalizeSubtype( match[ 1 ] );
+    }
+  }
+
+  // otherwise fall back to the extension in the URL. Look at the URL with its
+  // query string / fragment removed first, so dots or slashes inside the query
+  // (e.g. "a.jpg?v=1.2") cannot hide the real extension; then retry on the
+  // full URL to keep matching URLs such as "image?file=a.jpg".
+  if ( urlIsString ) {
+    const candidates = [ url.replace( /[?#].*$/, '' ), url ];
+    for ( const candidate of candidates ) {
+      const extension = extensionOf( candidate );
+      if ( extension ) {
+        const format = normalizeSubtype( extension );
+        if ( format !== 'unknown' ) {
+          return format;
+        }
+      }
+    }
+  }
+
+  // otherwise throw up our hands
+  return 'unknown';
+}
+
+let parsed;
+try {
+  parsed = JSON.parse( responsiveImagesJsonString );
+} catch ( e ) {
+  return [];
+}
+// Only a JSON array can be mapped; anything else yields no rows.
+if ( !Array.isArray( parsed ) ) {
+  return [];
+}
+
+const dataRegEx = new RegExp('^data');
+return parsed
+  .filter( d => d && typeof d === "object" ) // skip null / scalar entries
+  .map( d => ({
+    imgURL: d.url,
+    approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
+    approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
+    byteSize: Math.floor( d.byteSize || 0 ),
+    bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
+    isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
+    isDataURL: dataRegEx.test(d.url),
+    resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
+  }) );
+''';
+
+-- Distribution (p0/10/25/50/75/90/100) of image resource width, height,
+-- aspect ratio, megapixels, byte size and bits per pixel, per client and
+-- per resource format. Images whose approximate width or height is <= 1 are
+-- left out.
+WITH imgs AS (
+  -- One row per image reported by getSrcsetInfo for each page.
+  SELECT
+    _TABLE_SUFFIX AS client,
+    url AS pageURL,
+    imgURL,
+    approximateResourceWidth,
+    approximateResourceHeight,
+    byteSize,
+    bitsPerPixel,
+    isPixel,
+    isDataURL,
+    ( approximateResourceWidth * approximateResourceHeight ) / 1000000 AS megapixels,
+    -- getSrcsetInfo defaults a missing height to 0, and this CTE is evaluated
+    -- before the "> 1" filter below, so plain "/" could fail on division by
+    -- zero. SAFE_DIVIDE yields NULL for those rows instead.
+    SAFE_DIVIDE(approximateResourceWidth, approximateResourceHeight) AS aspectRatio,
+    resourceFormat
+  FROM
+    `httparchive.pages.2021_07_01_*`,
+    UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
+),
+
+percentiles AS (
+  -- 1001-element quantile arrays: element i is the (i / 10)th percentile.
+  SELECT
+    client,
+    resourceFormat,
+    APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
+    APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
+    APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
+    APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
+    APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
+    APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
+    COUNT(0) AS imgCount
+  FROM
+    imgs
+  WHERE
+    approximateResourceWidth > 1 AND
+    approximateResourceHeight > 1
+  GROUP BY
+    client,
+    resourceFormat
+)
+
+SELECT
+  percentile,
+  client,
+  resourceFormat,
+  imgCount,
+  -- percentile * 10 converts a percentile into an index of the 1000-quantile array
+  resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
+  resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
+  aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
+  megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
+  byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
+  bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
+FROM
+  percentiles,
+  UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
+ORDER BY
+  imgCount DESC,
+  -- two (client, format) groups can share an imgCount; keep their rows together
+  client,
+  resourceFormat,
+  percentile
diff --git a/sql/2021/media/image_0x0.sql b/sql/2021/media/image_0x0.sql
@@ -0,0 +1,44 @@
+-- getPixelInfo: for each entry of the JSON array passed in, reports whether
+-- the image resource is 0x0 (isPixel) and whether its URL starts with "data"
+-- (isDataURL). The other STRUCT fields are never set by the script.
+-- Yields nothing when the parsed input has no `map` method.
+CREATE TEMPORARY FUNCTION getPixelInfo(responsiveImagesJsonString STRING)
+RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, isPixel BOOL, isDataURL BOOL>>
+LANGUAGE js AS '''
+const images = JSON.parse(responsiveImagesJsonString);
+if (!images || !images.map) {
+  return;
+}
+
+const startsWithData = /^data/;
+return images.map(function (img) {
+  const isZeroByZero = img.approximateResourceWidth == 0 && img.approximateResourceHeight == 0;
+  return {
+    isPixel: isZeroByZero,
+    isDataURL: startsWithData.test(img.url)
+  };
+});
+''';
+
+-- Per client: how many images there are in total, how many are 0x0, how many
+-- of those 0x0 images are data URLs, and both counts as a share of all images.
+WITH per_image AS (
+  -- One row per image returned by getPixelInfo for each page.
+  SELECT
+    _TABLE_SUFFIX AS client,
+    img.isPixel AS isPixel,
+    img.isDataURL AS isDataURL
+  FROM
+    `httparchive.pages.2021_07_01_*`
+  CROSS JOIN
+    UNNEST(getPixelInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) AS img
+),
+
+per_client AS (
+  SELECT
+    client,
+    COUNT(*) AS total_imgs,
+    COUNTIF(isPixel) AS zero_pixel_imgs,
+    COUNTIF(isPixel AND isDataURL) AS zero_pixel_data_urls
+  FROM
+    per_image
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  total_imgs,
+  zero_pixel_imgs,
+  zero_pixel_data_urls,
+  -- both shares use the total image count as the denominator
+  SAFE_DIVIDE(zero_pixel_imgs, total_imgs) AS pct_zero_pixel_imgs,
+  SAFE_DIVIDE(zero_pixel_data_urls, total_imgs) AS pct_zero_pixel_data_urls
+FROM
+  per_client
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,3 +8,4 @@

		Analysts: if helpful, you can use this README to give additional info about the queries.
		-->