Skip to content

Commit

Permalink
Media 2021 queries (#2583)
Browse files Browse the repository at this point in the history
* Media 2021 queries (#2144)

* Test query for 'encoding -> format use'

* Percent of pages with picture elements, and distribution of number of picture elements per page

* Simplify query, based on advice from Kevin Farrugia

* Basic script for counting no of images that use lazy loading

* Replaced JSON_EXTRACT_SCALAR with JSON_VALUE as the former is deprecated

* Adding the query to pull in details about the usage of , ,  and responsive dimension specification based on new custom metrics

* Copying and updating last year's media queries

* Adding a file from Performance chapter and tweaking it to report image dimension by industry vertical

* Updated the alt query to also count images that have  attribute

* Removing reference to "decode=lazy" from alt tag SQ; creating a new query for decode usage

* removing a condition that is not valid for this query

* correcting name of a output query field to indicate %

* Add extra totals columns, query July data

* Fix for loop; 'lazy' → 'async'

* Add average image query

* Deal with nulls and INT64 limits

* Adding a query to find the usage of image cdns

* Adding a SQL to count cross domain image requests

* Bits per pixel, by format

* A few fixes to BPP/format query

* Align on distribution percentiles

* Copying Colin's query for top media queries from 2019 SQL code base

* Changed the description of the query

* Corrected the looping construct to make it more readable

* Two bytes and dimensions queries to rule them all

* Img elements with one and zero pixel resources
...excluded from other analysis

* Top aspect ratios query

* Remove unnecessary queries

* Portrait / aspect ratio / square query

* Comment out smallImageCount and bigImageCount constraint
...which was making it return no results?

* Most common sizes values

* Sizes implicit vs explicit, and parse errors

* .sql

* linting

* lint

* lint

* lint

* remaining queries

* lint

* 1x1 and 0x0

Co-authored-by: Eric Portis <[email protected]>
Co-authored-by: Akshay Ranganath <[email protected]>
  • Loading branch information
3 people authored Nov 24, 2021
1 parent 58c269d commit 5de0fe6
Show file tree
Hide file tree
Showing 40 changed files with 1,716 additions and 32 deletions.
1 change: 1 addition & 0 deletions sql/2021/media/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
Analysts: if helpful, you can use this README to give additional info about the queries.
-->

170 changes: 170 additions & 0 deletions sql/2021/media/bytes_and_dimensions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
-- Turns the JSON text of a responsive-images array into one struct per image.
--
-- Argument:
--   responsiveImagesJsonString: JSON text of an array of image objects. The
--     fields read from each object are url, mimeType, approximateResourceWidth,
--     approximateResourceHeight, byteSize and bitsPerPixel.
--
-- Returns one struct per array element:
--   imgURL                     the element's url, unchanged
--   approximateResourceWidth   floored; 0 when missing or falsy
--   approximateResourceHeight  floored; 0 when missing or falsy
--   byteSize                   floored; 0 when missing or falsy
--   bitsPerPixel               parsed as a float; 0 when missing or falsy
--   isPixel                    TRUE when the reported size is exactly 1x1
--   isDataURL                  TRUE when the url starts with "data"
--   resourceFormat             short format name (see pithyType), or 'unknown'
-- When the parsed input has no map() method (for example a NULL input) the
-- function returns nothing, i.e. SQL NULL.
CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
LANGUAGE js AS '''
// Canonical short name for every image subtype / file extension we report on.
// Keys are lower case. Built once per call instead of once per image.
const subtypeMap = {
  'svg+xml': 'svg',
  'svgz': 'svg',
  'jpeg': 'jpg',
  'jfif': 'jpg',
  'x-png': 'png',
  'vnd.microsoft.icon': 'ico',
  'x-icon': 'ico',
  'jxr': 'jxr',
  'vnd.ms-photo': 'jxr',
  'hdp': 'jxr',
  'wdp': 'jxr',
  'jpf': 'jp2',
  'jpx': 'jp2',
  'jpm': 'jp2',
  'mj2': 'jp2',
  'x-jp2-container': 'jp2',
  'x-jp2-codestream': 'jp2',
  'x-jpeg2000-image': 'jp2',
  'heic': 'heif',
  'x-ms-bmp': 'bmp',
  'x-pict': 'pict',
  'tif': 'tiff',
  'x-tif': 'tiff',
  'x-tiff': 'tiff',
  'vnd.mozilla.apng': 'apng',
  // identities
  'apng': 'apng',
  'jpg': 'jpg',
  'jp2': 'jp2',
  'png': 'png',
  'gif': 'gif',
  'ico': 'ico',
  'webp': 'webp',
  'avif': 'avif',
  'tiff': 'tiff',
  'flif': 'flif',
  'heif': 'heif',
  'jxl': 'jxl',
  'avif-sequence': 'avif-sequence', // keep separate from single frames...
  'heic-sequence': 'heic-sequence',
  'bmp': 'bmp',
  'pict': 'pict'
};

// Maps a lower-cased subtype or extension to its canonical short name.
function normalizeSubtype( subtype ) {
  // Own-property check: a bare `subtypeMap[ subtype ]` lookup also finds
  // members inherited from Object.prototype (e.g. "constructor"), which would
  // return a function instead of a format name.
  if ( Object.prototype.hasOwnProperty.call( subtypeMap, subtype ) ) {
    return subtypeMap[ subtype ];
  }
  return 'unknown'; // switch between:
  // `subtype`
  // to see everything, check if there's anything else worth capturing
  // `'unknown'`
  // to make results manageable
}

// Returns the lower-cased text after the last "." of the last "/"-separated
// segment of `text`, or null when there is no "/" or no "." to split on.
function extensionOf( text ) {
  const segments = text.split( "/" );
  if ( segments.length < 2 ) {
    return null;
  }
  const pieces = segments[ segments.length - 1 ].split( "." );
  if ( pieces.length < 2 ) {
    return null;
  }
  return pieces[ pieces.length - 1 ]
    .toLowerCase()
    .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ); // strip trailing query params
}

// Picks a short format name for one image, trying in order: the MIME type of
// a data URL, the content-type value, then the file extension in the URL.
function pithyType( { contentType, url } ) {
  // if it's a data url, take the mime type from there, done.
  if ( url &&
       typeof url === "string" ) {
    const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
    if ( match && match[ 1 ] ) {
      return normalizeSubtype( match[ 1 ] );
    }
  }
  // if we get a content-type header, use it!
  if ( contentType &&
       typeof contentType === "string" ) {
    const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
    if ( match && match[ 1 ] ) {
      return normalizeSubtype( match[ 1 ] );
    }
  }
  // otherwise fall back to extension in the URL
  if ( url &&
       typeof url === "string" ) {
    // Try the URL without its query string / fragment first, so that dots or
    // slashes inside the query ("photo.jpg?v=1.2", "photo.jpg?next=/a/b") do
    // not hide the real extension. Then try the whole URL, which still
    // recognises an extension that only appears in the query ("img?f=x.png").
    const candidates = [ url.split( /[?#]/ )[ 0 ], url ];
    for ( const candidate of candidates ) {
      const extension = extensionOf( candidate );
      if ( extension !== null ) {
        const format = normalizeSubtype( extension );
        if ( format !== 'unknown' ) {
          return format;
        }
      }
    }
  }
  // otherwise throw up our hands
  return 'unknown';
}

const parsed = JSON.parse( responsiveImagesJsonString );
if ( parsed && parsed.map ) {
  const dataRegEx = new RegExp('^data');
  return parsed.map( d => ({
    imgURL: d.url,
    approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
    approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
    byteSize: Math.floor( d.byteSize || 0 ),
    bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
    isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
    isDataURL: dataRegEx.test(d.url),
    resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
  }) );
}
''';

-- Distribution (p0/10/25/50/75/90/100) of image resource width, height,
-- aspect ratio, megapixels, byte size and bits per pixel, per client.

-- One row per image reported in the responsive-images custom metric.
WITH imgs AS (
  SELECT
    _TABLE_SUFFIX AS client,
    url AS pageURL,
    imgURL,
    approximateResourceWidth,
    approximateResourceHeight,
    byteSize,
    bitsPerPixel,
    isPixel,
    isDataURL,
    (approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels,
    -- getSrcsetInfo reports a missing height as 0, so guard the division here
    -- rather than relying on the later `> 1` filter being evaluated first.
    SAFE_DIVIDE(approximateResourceWidth, approximateResourceHeight) AS aspectRatio,
    resourceFormat
  FROM
    `httparchive.pages.2021_07_01_*`,
    UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
),

-- 1000-quantile arrays (1001 boundaries each) per client, ignoring images
-- whose reported width or height is 0 or 1.
percentiles AS (
  SELECT
    client,
    APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
    APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
    APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
    APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
    APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
    APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
    COUNT(0) AS imgCount
  FROM
    imgs
  WHERE
    approximateResourceWidth > 1 AND
    approximateResourceHeight > 1
  GROUP BY
    client
)

SELECT
  percentile,
  client,
  imgCount,
  -- With 1000 quantiles, percentile p sits at array offset p * 10.
  resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
  resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
  aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
  megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
  byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
  bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
FROM
  percentiles,
  UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
ORDER BY
  imgCount DESC,
  client, -- tie-breaker so the output order is deterministic
  percentile
173 changes: 173 additions & 0 deletions sql/2021/media/bytes_and_dimensions_by_format.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
-- Turns the JSON text of a responsive-images array into one struct per image.
--
-- Argument:
--   responsiveImagesJsonString: JSON text of an array of image objects. The
--     fields read from each object are url, mimeType, approximateResourceWidth,
--     approximateResourceHeight, byteSize and bitsPerPixel.
--
-- Returns one struct per array element:
--   imgURL                     the element's url, unchanged
--   approximateResourceWidth   floored; 0 when missing or falsy
--   approximateResourceHeight  floored; 0 when missing or falsy
--   byteSize                   floored; 0 when missing or falsy
--   bitsPerPixel               parsed as a float; 0 when missing or falsy
--   isPixel                    TRUE when the reported size is exactly 1x1
--   isDataURL                  TRUE when the url starts with "data"
--   resourceFormat             short format name (see pithyType), or 'unknown'
-- When the parsed input has no map() method (for example a NULL input) the
-- function returns nothing, i.e. SQL NULL.
CREATE TEMPORARY FUNCTION getSrcsetInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, bitsPerPixel NUMERIC, isPixel BOOL, isDataURL BOOL, resourceFormat STRING>>
LANGUAGE js AS '''
// Canonical short name for every image subtype / file extension we report on.
// Keys are lower case. Built once per call instead of once per image.
const subtypeMap = {
  'svg+xml': 'svg',
  'svgz': 'svg',
  'jpeg': 'jpg',
  'jfif': 'jpg',
  'x-png': 'png',
  'vnd.microsoft.icon': 'ico',
  'x-icon': 'ico',
  'jxr': 'jxr',
  'vnd.ms-photo': 'jxr',
  'hdp': 'jxr',
  'wdp': 'jxr',
  'jpf': 'jp2',
  'jpx': 'jp2',
  'jpm': 'jp2',
  'mj2': 'jp2',
  'x-jp2-container': 'jp2',
  'x-jp2-codestream': 'jp2',
  'x-jpeg2000-image': 'jp2',
  'heic': 'heif',
  'x-ms-bmp': 'bmp',
  'x-pict': 'pict',
  'tif': 'tiff',
  'x-tif': 'tiff',
  'x-tiff': 'tiff',
  'vnd.mozilla.apng': 'apng',
  // identities
  'apng': 'apng',
  'jpg': 'jpg',
  'jp2': 'jp2',
  'png': 'png',
  'gif': 'gif',
  'ico': 'ico',
  'webp': 'webp',
  'avif': 'avif',
  'tiff': 'tiff',
  'flif': 'flif',
  'heif': 'heif',
  'jxl': 'jxl',
  'avif-sequence': 'avif-sequence', // keep separate from single frames...
  'heic-sequence': 'heic-sequence',
  'bmp': 'bmp',
  'pict': 'pict'
};

// Maps a lower-cased subtype or extension to its canonical short name.
function normalizeSubtype( subtype ) {
  // Own-property check: a bare `subtypeMap[ subtype ]` lookup also finds
  // members inherited from Object.prototype (e.g. "constructor"), which would
  // return a function instead of a format name.
  if ( Object.prototype.hasOwnProperty.call( subtypeMap, subtype ) ) {
    return subtypeMap[ subtype ];
  }
  return 'unknown'; // switch between:
  // `subtype`
  // to see everything, check if there's anything else worth capturing
  // `'unknown'`
  // to make results manageable
}

// Returns the lower-cased text after the last "." of the last "/"-separated
// segment of `text`, or null when there is no "/" or no "." to split on.
function extensionOf( text ) {
  const segments = text.split( "/" );
  if ( segments.length < 2 ) {
    return null;
  }
  const pieces = segments[ segments.length - 1 ].split( "." );
  if ( pieces.length < 2 ) {
    return null;
  }
  return pieces[ pieces.length - 1 ]
    .toLowerCase()
    .replace( /^(\\w+)[\\?\\&\\#].*/, '$1' ); // strip trailing query params
}

// Picks a short format name for one image, trying in order: the MIME type of
// a data URL, the content-type value, then the file extension in the URL.
function pithyType( { contentType, url } ) {
  // if it's a data url, take the mime type from there, done.
  if ( url &&
       typeof url === "string" ) {
    const match = url.toLowerCase().match( /^data:image\\/([\\w\\-\\.\\+]+)/ );
    if ( match && match[ 1 ] ) {
      return normalizeSubtype( match[ 1 ] );
    }
  }
  // if we get a content-type header, use it!
  if ( contentType &&
       typeof contentType === "string" ) {
    const match = contentType.toLowerCase().match( /image\\/([\\w\\-\\.\\+]+)/ );
    if ( match && match[ 1 ] ) {
      return normalizeSubtype( match[ 1 ] );
    }
  }
  // otherwise fall back to extension in the URL
  if ( url &&
       typeof url === "string" ) {
    // Try the URL without its query string / fragment first, so that dots or
    // slashes inside the query ("photo.jpg?v=1.2", "photo.jpg?next=/a/b") do
    // not hide the real extension. Then try the whole URL, which still
    // recognises an extension that only appears in the query ("img?f=x.png").
    const candidates = [ url.split( /[?#]/ )[ 0 ], url ];
    for ( const candidate of candidates ) {
      const extension = extensionOf( candidate );
      if ( extension !== null ) {
        const format = normalizeSubtype( extension );
        if ( format !== 'unknown' ) {
          return format;
        }
      }
    }
  }
  // otherwise throw up our hands
  return 'unknown';
}

const parsed = JSON.parse( responsiveImagesJsonString );
if ( parsed && parsed.map ) {
  const dataRegEx = new RegExp('^data');
  return parsed.map( d => ({
    imgURL: d.url,
    approximateResourceWidth: Math.floor( d.approximateResourceWidth || 0 ),
    approximateResourceHeight: Math.floor( d.approximateResourceHeight || 0 ),
    byteSize: Math.floor( d.byteSize || 0 ),
    bitsPerPixel: parseFloat( d.bitsPerPixel || 0 ),
    isPixel: d.approximateResourceWidth == 1 && d.approximateResourceHeight == 1,
    isDataURL: dataRegEx.test(d.url),
    resourceFormat: pithyType({ contentType: d.mimeType, url: d.url })
  }) );
}
''';

-- Distribution (p0/10/25/50/75/90/100) of image resource width, height,
-- aspect ratio, megapixels, byte size and bits per pixel, per client and
-- image format.

-- One row per image reported in the responsive-images custom metric.
WITH imgs AS (
  SELECT
    _TABLE_SUFFIX AS client,
    url AS pageURL,
    imgURL,
    approximateResourceWidth,
    approximateResourceHeight,
    byteSize,
    bitsPerPixel,
    isPixel,
    isDataURL,
    (approximateResourceWidth * approximateResourceHeight) / 1000000 AS megapixels,
    -- getSrcsetInfo reports a missing height as 0, so guard the division here
    -- rather than relying on the later `> 1` filter being evaluated first.
    SAFE_DIVIDE(approximateResourceWidth, approximateResourceHeight) AS aspectRatio,
    resourceFormat
  FROM
    `httparchive.pages.2021_07_01_*`,
    UNNEST(getSrcsetInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images')))
),

-- 1000-quantile arrays (1001 boundaries each) per client and format, ignoring
-- images whose reported width or height is 0 or 1.
percentiles AS (
  SELECT
    client,
    resourceFormat,
    APPROX_QUANTILES(approximateResourceWidth, 1000) AS resourceWidthPercentiles,
    APPROX_QUANTILES(approximateResourceHeight, 1000) AS resourceHeightPercentiles,
    APPROX_QUANTILES(aspectRatio, 1000) AS aspectRatioPercentiles,
    APPROX_QUANTILES(megapixels, 1000) AS megapixelsPercentiles,
    APPROX_QUANTILES(byteSize, 1000) AS byteSizePercentiles,
    APPROX_QUANTILES(bitsPerPixel, 1000) AS bitsPerPixelPercentiles,
    COUNT(0) AS imgCount
  FROM
    imgs
  WHERE
    approximateResourceWidth > 1 AND
    approximateResourceHeight > 1
  GROUP BY
    client,
    resourceFormat
)

SELECT
  percentile,
  client,
  resourceFormat,
  imgCount,
  -- With 1000 quantiles, percentile p sits at array offset p * 10.
  resourceWidthPercentiles[OFFSET(percentile * 10)] AS resourceWidth,
  resourceHeightPercentiles[OFFSET(percentile * 10)] AS resourceHeight,
  aspectRatioPercentiles[OFFSET(percentile * 10)] AS aspectRatio,
  megapixelsPercentiles[OFFSET(percentile * 10)] AS megapixels,
  byteSizePercentiles[OFFSET(percentile * 10)] AS byteSize,
  bitsPerPixelPercentiles[OFFSET(percentile * 10)] AS bitsPerPixel
FROM
  percentiles,
  UNNEST([0, 10, 25, 50, 75, 90, 100]) AS percentile
ORDER BY
  imgCount DESC,
  -- Tie-breakers: groups that share an imgCount stay contiguous and the
  -- output order is deterministic.
  client,
  resourceFormat,
  percentile
44 changes: 44 additions & 0 deletions sql/2021/media/image_0x0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- For every entry of the responsive-images JSON array, reports whether the
-- image resource is 0x0 (isPixel) and whether its url starts with "data"
-- (isDataURL). The other declared struct fields are not populated. Returns
-- nothing (SQL NULL) when the parsed input has no map() method.
CREATE TEMPORARY FUNCTION getPixelInfo(responsiveImagesJsonString STRING)
RETURNS ARRAY<STRUCT<imgURL STRING, approximateResourceWidth INT64, approximateResourceHeight INT64, byteSize INT64, isPixel BOOL, isDataURL BOOL>>
LANGUAGE js AS '''
const images = JSON.parse(responsiveImagesJsonString);
// Nothing to report unless the payload parsed to something iterable via map().
if (!images || !images.map) {
  return;
}
const startsWithData = /^data/;
return images.map(function (image) {
  // Loose equality on purpose, matching how the dimensions are compared
  // elsewhere: only values that coerce to 0 count, a missing field does not.
  const zeroWide = image.approximateResourceWidth == 0;
  const zeroHigh = image.approximateResourceHeight == 0;
  return {
    isPixel: zeroWide && zeroHigh,
    isDataURL: startsWithData.test(image.url)
  };
});
''';

-- Share of images whose resource is reported as 0x0, and the share that are
-- both 0x0 and data URLs, per client.

-- Per-client totals, aggregated straight from the unnested image flags.
WITH counts AS (
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(0) AS total_imgs,
    COUNTIF(img.isPixel) AS zero_pixel_imgs,
    COUNTIF(img.isPixel AND img.isDataURL) AS zero_pixel_data_urls
  FROM
    `httparchive.pages.2021_07_01_*`,
    UNNEST(getPixelInfo(JSON_QUERY(JSON_VALUE(payload, '$._responsive_images'), '$.responsive-images'))) AS img
  GROUP BY
    client
)

SELECT
  client,
  total_imgs,
  zero_pixel_imgs,
  zero_pixel_data_urls,
  -- Both ratios are taken over all images.
  SAFE_DIVIDE(zero_pixel_imgs, total_imgs) AS pct_zero_pixel_imgs,
  SAFE_DIVIDE(zero_pixel_data_urls, total_imgs) AS pct_zero_pixel_data_urls
FROM
  counts
Loading

0 comments on commit 5de0fe6

Please sign in to comment.