Skip to content

Commit

Permalink
Merge pull request #2 from akshay-ranganath/main
Browse files Browse the repository at this point in the history
Adding queries from past year
  • Loading branch information
eeeps authored Nov 1, 2021
2 parents 0ad2835 + 9db8161 commit 2fe4cfc
Show file tree
Hide file tree
Showing 24 changed files with 1,022 additions and 0 deletions.
37 changes: 37 additions & 0 deletions sql/2021/media/big_non_custom_metrics.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#standardSQL
# non custom metrics sql that uses regexp on response bodies
# img src vs data-uri
# count rel=preconnect
# video with src
# video with source
# figure
# figure with figcaption

SELECT
client,
COUNTIF(has_img_data_uri) / COUNTIF(has_img_src) AS pages_with_img_data_uri_pct,
COUNTIF(rel_preconnect) / COUNT(0) AS pages_with_rel_preconnect_pct,
COUNTIF(has_video_src) / COUNT(0) AS pages_with_video_src_pct,
COUNTIF(has_video_source) / COUNT(0) AS pages_with_video_source_pct,
COUNTIF(has_figure) / COUNT(0) AS pages_with_figure_pct,
COUNTIF(has_figcaption) / COUNT(0) AS pages_with_figcaption_pct
FROM (
SELECT
client,
page,
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=(?:\"|\')*data[:]image/(?:\"|\')*[^><]*>') AS has_img_data_uri,
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=[^><]*>') AS has_img_src,
REGEXP_CONTAINS(body, r'(?i)<link[^><]*rel=(?:\"|\')*preconnect/(?:\"|\')*[^><]*>') AS rel_preconnect,
REGEXP_CONTAINS(body, r'(?i)<video[^><]*src=[^><]*>') AS has_video_src,
REGEXP_CONTAINS(body, r'(?i)<video[^><]*>.*?<source[^><]*>.*?</video>') AS has_video_source,
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>') AS has_figure,
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>.*?<figcaption[^><]*>.*?</figure>') AS has_figcaption
FROM
`httparchive.almanac.summary_response_bodies`
WHERE
date = '2021-08-01' AND
firstHtml)
GROUP BY
client
ORDER BY
client
89 changes: 89 additions & 0 deletions sql/2021/media/bytes_per_pixel.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#standardSQL
# Bytes per pixel per image format
CREATE TEMPORARY FUNCTION getImages(payload STRING)
RETURNS ARRAY<STRUCT<url STRING, naturalWidth INT64, naturalHeight INT64, width INT64, height INT64>> LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var images = JSON.parse($._Images) || [];
return images.map(({url, naturalHeight, naturalWidth, width, height}) => ({url, naturalHeight: Number.parseInt(naturalHeight) || 0, naturalWidth: Number.parseInt(naturalWidth) || 0, width: Number.parseInt(width) || 0, height: Number.parseInt(height) || 0}));
} catch (e) {}
return null;
''';

SELECT
a.client,
imageType,
count(0) AS count,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(100)] AS pixels_p10,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(250)] AS pixels_p25,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(500)] AS pixels_p50,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(750)] AS pixels_p75,
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(900)] AS pixels_p90,
APPROX_QUANTILES(bytes, 1000)[OFFSET(100)] AS bytes_p10,
APPROX_QUANTILES(bytes, 1000)[OFFSET(250)] AS bytes_p25,
APPROX_QUANTILES(bytes, 1000)[OFFSET(500)] AS bytes_p50,
APPROX_QUANTILES(bytes, 1000)[OFFSET(750)] AS bytes_p75,
APPROX_QUANTILES(bytes, 1000)[OFFSET(900)] AS bytes_p90,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(100)] AS bpp_p10,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(250)] AS bpp_p25,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(500)] AS bpp_p50,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(750)] AS bpp_p75,
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(900)] AS bpp_p90
FROM
(
SELECT
_TABLE_SUFFIX AS client,
p.url AS page,
image.url AS url,
image.width AS width,
image.height AS height,
image.naturalWidth AS naturalWidth,
image.naturalHeight AS naturalHeight,
IFNULL(image.width, 0) * IFNULL(image.height, 0) AS pixels,
IFNULL(image.naturalWidth, 0) * IFNULL(image.naturalHeight, 0) AS naturalPixels
FROM
`httparchive.pages.2021_08_01_*` p
CROSS JOIN UNNEST(getImages(payload)) AS image
WHERE
image.naturalHeight > 0 AND
image.naturalWidth > 0
) a
LEFT JOIN
(
SELECT
client,
page,
url,
NULLIF(if(REGEXP_CONTAINS(mimetype, r'(?i)^application|^applicaton|^binary|^image$|^multipart|^media|^$|^text/html|^text/plain|\d|array|unknown|undefined|\*|string|^img|^images|^text|\%2f|\(|ipg$|jpe$|jfif'), format, LOWER(regexp_replace(regexp_replace(mimetype, r'(?is).*image[/\\](?:x-)?|[\."]|[ +,;]+.*$', ''), r'(?i)pjpeg|jpeg', 'jpg'))), '') AS imageType,
respSize AS bytes
FROM `httparchive.almanac.requests`

WHERE
# many 404s and redirects show up as image/gif
status = 200 AND

# we are trying to catch images. WPO populates the format for media but it uses a file extension guess.
#So we exclude mimetypes that aren't image or where the format couldn't be guessed by WPO
(format <> '' OR mimetype LIKE 'image%') AND

# many image/gifs are really beacons with 1x1 pixel, but svgs can get caught in the mix
(respSize > 1500 OR REGEXP_CONTAINS(mimetype, r'svg')) AND

# strip favicon requests
format <> 'ico' AND

# strip video mimetypes and other favicons
NOT REGEXP_CONTAINS(mimetype, r'video|ico')
) b
ON (b.client = a.client AND a.page = b.page AND a.url = b.url)

WHERE
naturalPixels > 0 AND
bytes > 0 AND
imageType IN ('jpg', 'png', 'webp', 'gif', 'svg')
GROUP BY
client,
imageType
ORDER BY
client DESC
imageType DESC;
52 changes: 52 additions & 0 deletions sql/2021/media/image_alt.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#standardSQL
# usage of alt text in images

# returns all the data we need from _markup
CREATE TEMPORARY FUNCTION get_markup_info(markup_string STRING)
RETURNS STRUCT<
total INT64,
alt_missing INT64,
alt_blank INT64,
alt_present INT64,
decode_lazy INT64

> LANGUAGE js AS '''
var result = {};
try {
var markup = JSON.parse(markup_string);
if (Array.isArray(markup) || typeof markup != 'object') return result;
result.total = markup.images.img.total;
result.alt_missing = markup.images.img.alt.missing;
result.alt_blank = markup.images.img.alt.blank;
result.alt_present = markup.images.img.alt.present;
result.decode_lazy = markup.images.img.decoding || 0;
} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(markup_info.total > 0), COUNT(0)) AS pages_with_img_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_missing > 0), COUNT(0)) AS pages_with_alt_missing_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_blank > 0), COUNT(0)) AS pages_with_alt_blank_pct,
SAFE_DIVIDE(COUNTIF(markup_info.alt_present > 0), COUNT(0)) AS pages_with_alt_present_pct,
SAFE_DIVIDE(COUNTIF(markup_info.decode_lazy > 0), COUNT(0)) as pages_with_decode_pct,
SUM(markup_info.total) AS img_total,
SAFE_DIVIDE(SUM(markup_info.alt_missing), SUM(markup_info.total)) AS imgs_alt_missing_pct,
SAFE_DIVIDE(SUM(markup_info.alt_blank), SUM(markup_info.total)) AS img_alt_blank_pct,
SAFE_DIVIDE(SUM(markup_info.alt_present), SUM(markup_info.total)) AS img_alt_present_pct,
SAFE_DIVIDE(SUM(markup_info.decode_lazy), SUM(markup_info.total)) AS img_decode_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
url,
get_markup_info(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
37 changes: 37 additions & 0 deletions sql/2021/media/image_dimension_by_device.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#standardSQL
# Taken from Performance 2020 chapter
# Removed the condition to look for only ECommerce

## $4.11 run

CREATE TEMPORARY FUNCTION getImageDimensions(payload STRING)
RETURNS ARRAY<STRUCT<height INT64, width INT64>> LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var images = JSON.parse($._Images);
return images.map(i => ({height: i.naturalHeight, width: i.naturalWidth}));
} catch (e) {
return [];
}
''';

SELECT
percentile,
category,
_TABLE_SUFFIX AS client,
APPROX_QUANTILES(image.width, 1000)[OFFSET(percentile * 10)] AS image_width,
APPROX_QUANTILES(image.height, 1000)[OFFSET(percentile * 10)] AS image_height
FROM
`httparchive.pages.2021_08_01_*`
JOIN
`httparchive.technologies.2021_08_01_*`
USING (_TABLE_SUFFIX, url),
UNNEST(getImageDimensions(payload)) AS image,
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
percentile,
client,
category
ORDER BY
percentile,
client
25 changes: 25 additions & 0 deletions sql/2021/media/image_mimetype_ext.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#standardSQL
# images mimetype vs extension
SELECT
client,
ext,
mimetype,
COUNT(0) AS ext_mime_image_count,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_images,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) AS total_image_pct,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, ext)) AS ext_pct,
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, mimetype)) AS mime_pct
FROM
`httparchive.almanac.requests`
WHERE
date = '2021-08-01' and
type = 'image'
GROUP BY
client,
ext,
mimetype
HAVING
ext_mime_image_count > 10000
ORDER BY
ext_mime_image_count DESC,
client
41 changes: 41 additions & 0 deletions sql/2021/media/image_srcset_candidates.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#standardSQL
# images srcset candidates average

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_candidates_avg INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);
if (Array.isArray(media) || typeof media != 'object') return result;
result.num_srcset_all = media.num_srcset_all;
result.num_srcset_candidates_avg =
media.num_srcset_all == 0? 0: (media.num_srcset_candidates / media.num_srcset_all);
} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <=3), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_3_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <= 5), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_5_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 5 AND media_info.num_srcset_candidates_avg <= 10), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_5_10_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 10 AND media_info.num_srcset_candidates_avg <= 15), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_10_15_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 15 AND media_info.num_srcset_candidates_avg <= 20), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_15_20_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
41 changes: 41 additions & 0 deletions sql/2021/media/image_srcset_descriptor.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#standardSQL
# images with srcset descriptor_x descriptor_w

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_descriptor_x INT64,
num_srcset_descriptor_w INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);
if (Array.isArray(media) || typeof media != 'object') return result;
result.num_srcset_all = media.num_srcset_all;
result.num_srcset_descriptor_x = media.num_srcset_descriptor_x;
result.num_srcset_descriptor_w = media.num_srcset_descriptor_w;
} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_x > 0), COUNT(0)) AS pages_with_srcset_descriptor_x_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_w > 0), COUNT(0)) AS pages_with_srcset_descriptor_w_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_x), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_x_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_w), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_w_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
39 changes: 39 additions & 0 deletions sql/2021/media/image_srcset_sizes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#standardSQL
# images with srcset w/wo sizes

# returns all the data we need from _media
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING)
RETURNS STRUCT<
num_srcset_all INT64,
num_srcset_sizes INT64
> LANGUAGE js AS '''
var result = {};
try {
var media = JSON.parse(media_string);
if (Array.isArray(media) || typeof media != 'object') return result;
result.num_srcset_all = media.num_srcset_all;
result.num_srcset_sizes = media.num_srcset_sizes;
} catch (e) {}
return result;
''';

SELECT
client,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct,
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_sizes > 0), COUNT(0)) AS pages_with_srcset_sizes_pct,
SAFE_DIVIDE((COUNTIF(media_info.num_srcset_all > 0) - COUNTIF(media_info.num_srcset_sizes > 0)), COUNT(0)) AS pages_with_srcset_wo_sizes_pct,
SAFE_DIVIDE(SUM(media_info.num_srcset_sizes), SUM(media_info.num_srcset_all)) AS instances_of_srcset_sizes_pct,
SAFE_DIVIDE((SUM(media_info.num_srcset_all) - SUM(media_info.num_srcset_sizes)), SUM(media_info.num_srcset_all)) AS instances_of_srcset_wo_sizes_pct
FROM (
SELECT
_TABLE_SUFFIX AS client,
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info
FROM
`httparchive.pages.2021_08_01_*`)
GROUP BY
client
ORDER BY
client
22 changes: 22 additions & 0 deletions sql/2021/media/img_covid_in_path.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#standardSQL
# images with covid in path - average size
SELECT
client,
LOWER(ext) AS ext,
COUNT(0) AS ext_count,
COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid')) AS ext_count_covid,
SAFE_DIVIDE(SUM(respSize), COUNT(0)) AS avg_size,
SAFE_DIVIDE(SUM(IF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'), respSize, 0)), COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'))) AS avg_size_covid
FROM
`httparchive.almanac.requests`
WHERE
date = '2021-08-01' AND
type = 'image'
GROUP BY
client,
ext
HAVING
ext_count_covid > 100
ORDER BY
client,
ext_count_covid DESC;
Loading

0 comments on commit 2fe4cfc

Please sign in to comment.