Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding queries from past year #2

Merged
merged 6 commits into from
Nov 1, 2021
37 changes: 37 additions & 0 deletions sql/2021/media/big_non_custom_metrics.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#standardSQL
# non custom metrics sql that uses regexp on response bodies
# img src vs data-uri
# count rel=preconnect
# video with src
# video with source
# figure
# figure with figcaption

SELECT
client,
COUNTIF(has_img_data_uri) / COUNTIF(has_img_src) AS pages_with_img_data_uri_pct,
COUNTIF(rel_preconnect) / COUNT(0) AS pages_with_rel_preconnect_pct,
COUNTIF(has_video_src) / COUNT(0) AS pages_with_video_src_pct,
COUNTIF(has_video_source) / COUNT(0) AS pages_with_video_source_pct,
COUNTIF(has_figure) / COUNT(0) AS pages_with_figure_pct,
COUNTIF(has_figcaption) / COUNT(0) AS pages_with_figcaption_pct
FROM (
SELECT
client,
page,
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=(?:\"|\')*data[:]image/(?:\"|\')*[^><]*>') AS has_img_data_uri,
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=[^><]*>') AS has_img_src,
REGEXP_CONTAINS(body, r'(?i)<link[^><]*rel=(?:\"|\')*preconnect/(?:\"|\')*[^><]*>') AS rel_preconnect,
REGEXP_CONTAINS(body, r'(?i)<video[^><]*src=[^><]*>') AS has_video_src,
REGEXP_CONTAINS(body, r'(?i)<video[^><]*>.*?<source[^><]*>.*?</video>') AS has_video_source,
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>') AS has_figure,
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>.*?<figcaption[^><]*>.*?</figure>') AS has_figcaption
FROM
`httparchive.almanac.summary_response_bodies`
WHERE
date = '2021-08-01' AND
firstHtml)
GROUP BY
client
ORDER BY
client
89 changes: 89 additions & 0 deletions sql/2021/media/bytes_per_pixel.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#standardSQL
# Bytes per pixel per image format
CREATE TEMPORARY FUNCTION getImages(payload STRING)
RETURNS ARRAY<STRUCT<url STRING, naturalWidth INT64, naturalHeight INT64, width INT64, height INT64>> LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var images = JSON.parse($._Images) || [];
return images.map(({url, naturalHeight, naturalWidth, width, height}) => ({url, naturalHeight: Number.parseInt(naturalHeight) || 0, naturalWidth: Number.parseInt(naturalWidth) || 0, width: Number.parseInt(width) || 0, height: Number.parseInt(height) || 0}));
} catch (e) {}
return null;
''';

SELECT
a.client,
imageType,
count(0) AS count,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(100)] AS pixels_p10,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(250)] AS pixels_p25,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(500)] AS pixels_p50,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(750)] AS pixels_p75,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(900)] AS pixels_p90,
APPROX_QUANTILES(bytes, 1000)[OFFSET(100)] AS bytes_p10,
APPROX_QUANTILES(bytes, 1000)[OFFSET(250)] AS bytes_p25,
APPROX_QUANTILES(bytes, 1000)[OFFSET(500)] AS bytes_p50,
APPROX_QUANTILES(bytes, 1000)[OFFSET(750)] AS bytes_p75,
APPROX_QUANTILES(bytes, 1000)[OFFSET(900)] AS bytes_p90,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(100)] AS bpp_p10,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(250)] AS bpp_p25,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(500)] AS bpp_p50,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(750)] AS bpp_p75,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(900)] AS bpp_p90
FROM
(
SELECT
_TABLE_SUFFIX AS client,
p.url AS page,
image.url AS url,
image.width AS width,
image.height AS height,
image.naturalWidth AS naturalWidth,
image.naturalHeight AS naturalHeight,
IFNULL(image.width, 0) * IFNULL(image.height, 0) AS pixels,
IFNULL(image.naturalWidth, 0) * IFNULL(image.naturalHeight, 0) AS naturalPixels
FROM
`httparchive.pages.2021_08_01_*` p
CROSS JOIN UNNEST(getImages(payload)) AS image
WHERE
image.naturalHeight > 0 AND
image.naturalWidth > 0
) a
LEFT JOIN
(
SELECT
client,
page,
url,
NULLIF(if(REGEXP_CONTAINS(mimetype, r'(?i)^application|^applicaton|^binary|^image$|^multipart|^media|^$|^text/html|^text/plain|\d|array|unknown|undefined|\*|string|^img|^images|^text|\%2f|\(|ipg$|jpe$|jfif'), format, LOWER(regexp_replace(regexp_replace(mimetype, r'(?is).*image[/\\](?:x-)?|[\."]|[ +,;]+.*$', ''), r'(?i)pjpeg|jpeg', 'jpg'))), '') AS imageType,
respSize AS bytes
FROM `httparchive.almanac.requests`

WHERE
# many 404s and redirects show up as image/gif
status = 200 AND

# we are trying to catch images. WPO populates the format for media but it uses a file extension guess.
#So we exclude mimetypes that aren't image or where the format couldn't be guessed by WPO
(format <> '' OR mimetype LIKE 'image%') AND

# many image/gifs are really beacons with 1x1 pixel, but svgs can get caught in the mix
(respSize > 1500 OR REGEXP_CONTAINS(mimetype, r'svg')) AND

# strip favicon requests
format <> 'ico' AND

# strip video mimetypes and other favicons
NOT REGEXP_CONTAINS(mimetype, r'video|ico')
) b
ON (b.client = a.client AND a.page = b.page AND a.url = b.url)

WHERE
naturalPixels > 0 AND
bytes > 0 AND
imageType IN ('jpg', 'png', 'webp', 'gif', 'svg')
GROUP BY
client,
imageType
ORDER BY
client DESC
imageType DESC;
52 changes: 52 additions & 0 deletions sql/2021/media/image_alt.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#standardSQL
# usage of alt text in images

# returns all the data we need from _markup
CREATE TEMPORARY FUNCTION get_markup_info(markup_string STRING)
RETURNS STRUCT<
total INT64,
alt_missing INT64,
alt_blank INT64,
alt_present INT64,
decode_lazy INT64

> LANGUAGE js AS '''
var result = {};
try {
var markup = JSON.parse(markup_string);

if (Array.isArray(markup) || typeof markup != 'object') return result;

result.total = markup.images.img.total;
result.alt_missing = markup.images.img.alt.missing;
result.alt_blank = markup.images.img.alt.blank;
result.alt_present = markup.images.img.alt.present;
result.decode_lazy = markup.images.img.decoding || 0;

} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(markup_info.total > 0), COUNT(0)) AS pages_with_img_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_missing > 0), COUNT(0)) AS pages_with_alt_missing_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_blank > 0), COUNT(0)) AS pages_with_alt_blank_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_present > 0), COUNT(0)) AS pages_with_alt_present_pct,
SAFE_DIVIDE(COUNTIF(markup_info.decode_lazy > 0), COUNT(0)) as pages_with_decode_pct,
SUM(markup_info.total) AS img_total,
SAFE_DIVIDE(SUM(markup_info.alt_missing), SUM(markup_info.total)) AS imgs_alt_missing_pct,
SAFE_DIVIDE(SUM(markup_info.alt_blank), SUM(markup_info.total)) AS img_alt_blank_pct,
SAFE_DIVIDE(SUM(markup_info.alt_present), SUM(markup_info.total)) AS img_alt_present_pct,
SAFE_DIVIDE(SUM(markup_info.decode_lazy), SUM(markup_info.total)) AS img_decode_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
url,
get_markup_info(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
37 changes: 37 additions & 0 deletions sql/2021/media/image_dimension_by_device.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#standardSQL
# Taken from Performance 2020 chapter
# Removed the condition to look for only ECommerce

## $4.11 run

CREATE TEMPORARY FUNCTION getImageDimensions(payload STRING)
RETURNS ARRAY<STRUCT<height INT64, width INT64>> LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var images = JSON.parse($._Images);
return images.map(i => ({height: i.naturalHeight, width: i.naturalWidth}));
} catch (e) {
return [];
}
''';

SELECT
percentile,
category,
_TABLE_SUFFIX AS client,
APPROX_QUANTILES(image.width, 1000)[OFFSET(percentile * 10)] AS image_width,
APPROX_QUANTILES(image.height, 1000)[OFFSET(percentile * 10)] AS image_height
FROM
`httparchive.pages.2021_08_01_*`
JOIN
`httparchive.technologies.2021_08_01_*`
USING (_TABLE_SUFFIX, url),
UNNEST(getImageDimensions(payload)) AS image,
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
percentile,
client,
category
ORDER BY
percentile,
client
25 changes: 25 additions & 0 deletions sql/2021/media/image_mimetype_ext.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#standardSQL
# images mimetype vs extension
SELECT
client,
ext,
mimetype,
COUNT(0) AS ext_mime_image_count,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_images,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) AS total_image_pct,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, ext)) AS ext_pct,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, mimetype)) AS mime_pct
FROM
`httparchive.almanac.requests`
WHERE
date = '2021-08-01' and
type = 'image'
GROUP BY
client,
ext,
mimetype
HAVING
ext_mime_image_count > 10000
ORDER BY
ext_mime_image_count DESC,
client
41 changes: 41 additions & 0 deletions sql/2021/media/image_srcset_candidates.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#standardSQL
# images srcset candidates average

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_candidates_avg INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);

if (Array.isArray(media) || typeof media != 'object') return result;

result.num_srcset_all = media.num_srcset_all;
result.num_srcset_candidates_avg =
media.num_srcset_all == 0? 0: (media.num_srcset_candidates / media.num_srcset_all);

} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <=3), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_3_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <= 5), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_5_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 5 AND media_info.num_srcset_candidates_avg <= 10), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_5_10_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 10 AND media_info.num_srcset_candidates_avg <= 15), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_10_15_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 15 AND media_info.num_srcset_candidates_avg <= 20), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_15_20_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
41 changes: 41 additions & 0 deletions sql/2021/media/image_srcset_descriptor.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#standardSQL
# images with srcset descriptor_x descriptor_w

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_descriptor_x INT64,
num_srcset_descriptor_w INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);

if (Array.isArray(media) || typeof media != 'object') return result;

result.num_srcset_all = media.num_srcset_all;
result.num_srcset_descriptor_x = media.num_srcset_descriptor_x;
result.num_srcset_descriptor_w = media.num_srcset_descriptor_w;

} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_x > 0), COUNT(0)) AS pages_with_srcset_descriptor_x_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_w > 0), COUNT(0)) AS pages_with_srcset_descriptor_w_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_x), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_x_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_w), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_w_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
39 changes: 39 additions & 0 deletions sql/2021/media/image_srcset_sizes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#standardSQL
# images with srcset w/wo sizes

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_sizes INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);

if (Array.isArray(media) || typeof media != 'object') return result;

result.num_srcset_all = media.num_srcset_all;
result.num_srcset_sizes = media.num_srcset_sizes;

} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_sizes > 0), COUNT(0)) AS pages_with_srcset_sizes_pct,
SAFE_DIVIDE((COUNTIF(media_info.num_srcset_all > 0) - COUNTIF(media_info.num_srcset_sizes > 0)), COUNT(0)) AS pages_with_srcset_wo_sizes_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_sizes), SUM(media_info.num_srcset_all)) AS instances_of_srcset_sizes_pct,
SAFE_DIVIDE((SUM(media_info.num_srcset_all) - SUM(media_info.num_srcset_sizes)), SUM(media_info.num_srcset_all)) AS instances_of_srcset_wo_sizes_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
22 changes: 22 additions & 0 deletions sql/2021/media/img_covid_in_path.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#standardSQL
# images with covid in path - average size
SELECT
client,
LOWER(ext) AS ext,
COUNT(0) AS ext_count,
COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid')) AS ext_count_covid,
SAFE_DIVIDE(SUM(respSize), COUNT(0)) AS avg_size,
SAFE_DIVIDE(SUM(IF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'), respSize, 0)), COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'))) AS avg_size_covid
FROM
`httparchive.almanac.requests`
WHERE
date = '2021-08-01' AND
type = 'image'
GROUP BY
client,
ext
HAVING
ext_count_covid > 100
ORDER BY
client,
ext_count_covid DESC;
Loading