forked from HTTPArchive/almanac.httparchive.org
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from akshay-ranganath/main
Adding queries from past year
- Loading branch information
Showing
24 changed files
with
1,022 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#standardSQL | ||
# non custom metrics sql that uses regexp on response bodies | ||
# img src vs data-uri | ||
# count rel=preconnect | ||
# video with src | ||
# video with source | ||
# figure | ||
# figure with figcaption | ||
|
||
SELECT | ||
client, | ||
COUNTIF(has_img_data_uri) / COUNTIF(has_img_src) AS pages_with_img_data_uri_pct, | ||
COUNTIF(rel_preconnect) / COUNT(0) AS pages_with_rel_preconnect_pct, | ||
COUNTIF(has_video_src) / COUNT(0) AS pages_with_video_src_pct, | ||
COUNTIF(has_video_source) / COUNT(0) AS pages_with_video_source_pct, | ||
COUNTIF(has_figure) / COUNT(0) AS pages_with_figure_pct, | ||
COUNTIF(has_figcaption) / COUNT(0) AS pages_with_figcaption_pct | ||
FROM ( | ||
SELECT | ||
client, | ||
page, | ||
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=(?:\"|\')*data[:]image/(?:\"|\')*[^><]*>') AS has_img_data_uri, | ||
REGEXP_CONTAINS(body, r'(?i)<img[^><]*src=[^><]*>') AS has_img_src, | ||
REGEXP_CONTAINS(body, r'(?i)<link[^><]*rel=(?:\"|\')*preconnect/(?:\"|\')*[^><]*>') AS rel_preconnect, | ||
REGEXP_CONTAINS(body, r'(?i)<video[^><]*src=[^><]*>') AS has_video_src, | ||
REGEXP_CONTAINS(body, r'(?i)<video[^><]*>.*?<source[^><]*>.*?</video>') AS has_video_source, | ||
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>') AS has_figure, | ||
REGEXP_CONTAINS(body, r'(?i)<figure[^><]*>.*?<figcaption[^><]*>.*?</figure>') AS has_figcaption | ||
FROM | ||
`httparchive.almanac.summary_response_bodies` | ||
WHERE | ||
date = '2021-08-01' AND | ||
firstHtml) | ||
GROUP BY | ||
client | ||
ORDER BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#standardSQL | ||
# Bytes per pixel per image format | ||
CREATE TEMPORARY FUNCTION getImages(payload STRING) | ||
RETURNS ARRAY<STRUCT<url STRING, naturalWidth INT64, naturalHeight INT64, width INT64, height INT64>> LANGUAGE js AS ''' | ||
try { | ||
var $ = JSON.parse(payload); | ||
var images = JSON.parse($._Images) || []; | ||
return images.map(({url, naturalHeight, naturalWidth, width, height}) => ({url, naturalHeight: Number.parseInt(naturalHeight) || 0, naturalWidth: Number.parseInt(naturalWidth) || 0, width: Number.parseInt(width) || 0, height: Number.parseInt(height) || 0})); | ||
} catch (e) {} | ||
return null; | ||
'''; | ||
|
||
SELECT | ||
a.client, | ||
imageType, | ||
count(0) AS count, | ||
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(100)] AS pixels_p10, | ||
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(250)] AS pixels_p25, | ||
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(500)] AS pixels_p50, | ||
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(750)] AS pixels_p75, | ||
APPROX_QUANTILES(if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 1000)[OFFSET(900)] AS pixels_p90, | ||
APPROX_QUANTILES(bytes, 1000)[OFFSET(100)] AS bytes_p10, | ||
APPROX_QUANTILES(bytes, 1000)[OFFSET(250)] AS bytes_p25, | ||
APPROX_QUANTILES(bytes, 1000)[OFFSET(500)] AS bytes_p50, | ||
APPROX_QUANTILES(bytes, 1000)[OFFSET(750)] AS bytes_p75, | ||
APPROX_QUANTILES(bytes, 1000)[OFFSET(900)] AS bytes_p90, | ||
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(100)] AS bpp_p10, | ||
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(250)] AS bpp_p25, | ||
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(500)] AS bpp_p50, | ||
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(750)] AS bpp_p75, | ||
APPROX_QUANTILES(round(bytes/if(imageType = 'svg' AND pixels > 0, pixels, naturalPixels), 4), 1000)[OFFSET(900)] AS bpp_p90 | ||
FROM | ||
( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
p.url AS page, | ||
image.url AS url, | ||
image.width AS width, | ||
image.height AS height, | ||
image.naturalWidth AS naturalWidth, | ||
image.naturalHeight AS naturalHeight, | ||
IFNULL(image.width, 0) * IFNULL(image.height, 0) AS pixels, | ||
IFNULL(image.naturalWidth, 0) * IFNULL(image.naturalHeight, 0) AS naturalPixels | ||
FROM | ||
`httparchive.pages.2021_08_01_*` p | ||
CROSS JOIN UNNEST(getImages(payload)) AS image | ||
WHERE | ||
image.naturalHeight > 0 AND | ||
image.naturalWidth > 0 | ||
) a | ||
LEFT JOIN | ||
( | ||
SELECT | ||
client, | ||
page, | ||
url, | ||
NULLIF(if(REGEXP_CONTAINS(mimetype, r'(?i)^application|^applicaton|^binary|^image$|^multipart|^media|^$|^text/html|^text/plain|\d|array|unknown|undefined|\*|string|^img|^images|^text|\%2f|\(|ipg$|jpe$|jfif'), format, LOWER(regexp_replace(regexp_replace(mimetype, r'(?is).*image[/\\](?:x-)?|[\."]|[ +,;]+.*$', ''), r'(?i)pjpeg|jpeg', 'jpg'))), '') AS imageType, | ||
respSize AS bytes | ||
FROM `httparchive.almanac.requests` | ||
|
||
WHERE | ||
# many 404s and redirects show up as image/gif | ||
status = 200 AND | ||
|
||
# we are trying to catch images. WPO populates the format for media but it uses a file extension guess. | ||
#So we exclude mimetypes that aren't image or where the format couldn't be guessed by WPO | ||
(format <> '' OR mimetype LIKE 'image%') AND | ||
|
||
# many image/gifs are really beacons with 1x1 pixel, but svgs can get caught in the mix | ||
(respSize > 1500 OR REGEXP_CONTAINS(mimetype, r'svg')) AND | ||
|
||
# strip favicon requests | ||
format <> 'ico' AND | ||
|
||
# strip video mimetypes and other favicons | ||
NOT REGEXP_CONTAINS(mimetype, r'video|ico') | ||
) b | ||
ON (b.client = a.client AND a.page = b.page AND a.url = b.url) | ||
|
||
WHERE | ||
naturalPixels > 0 AND | ||
bytes > 0 AND | ||
imageType IN ('jpg', 'png', 'webp', 'gif', 'svg') | ||
GROUP BY | ||
client, | ||
imageType | ||
ORDER BY | ||
client DESC | ||
imageType DESC; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#standardSQL | ||
# usage of alt text in images | ||
|
||
# returns all the data we need from _markup | ||
CREATE TEMPORARY FUNCTION get_markup_info(markup_string STRING) | ||
RETURNS STRUCT< | ||
total INT64, | ||
alt_missing INT64, | ||
alt_blank INT64, | ||
alt_present INT64, | ||
decode_lazy INT64 | ||
|
||
> LANGUAGE js AS ''' | ||
var result = {}; | ||
try { | ||
var markup = JSON.parse(markup_string); | ||
if (Array.isArray(markup) || typeof markup != 'object') return result; | ||
result.total = markup.images.img.total; | ||
result.alt_missing = markup.images.img.alt.missing; | ||
result.alt_blank = markup.images.img.alt.blank; | ||
result.alt_present = markup.images.img.alt.present; | ||
result.decode_lazy = markup.images.img.decoding || 0; | ||
} catch (e) {} | ||
return result; | ||
'''; | ||
|
||
SELECT | ||
client, | ||
SAFE_DIVIDE(COUNTIF(markup_info.total > 0), COUNT(0)) AS pages_with_img_pct, | ||
SAFE_DIVIDE(COUNTIF(markup_info.alt_missing > 0), COUNT(0)) AS pages_with_alt_missing_pct, | ||
SAFE_DIVIDE(COUNTIF(markup_info.alt_blank > 0), COUNT(0)) AS pages_with_alt_blank_pct, | ||
SAFE_DIVIDE(COUNTIF(markup_info.alt_present > 0), COUNT(0)) AS pages_with_alt_present_pct, | ||
SAFE_DIVIDE(COUNTIF(markup_info.decode_lazy > 0), COUNT(0)) as pages_with_decode_pct, | ||
SUM(markup_info.total) AS img_total, | ||
SAFE_DIVIDE(SUM(markup_info.alt_missing), SUM(markup_info.total)) AS imgs_alt_missing_pct, | ||
SAFE_DIVIDE(SUM(markup_info.alt_blank), SUM(markup_info.total)) AS img_alt_blank_pct, | ||
SAFE_DIVIDE(SUM(markup_info.alt_present), SUM(markup_info.total)) AS img_alt_present_pct, | ||
SAFE_DIVIDE(SUM(markup_info.decode_lazy), SUM(markup_info.total)) AS img_decode_pct | ||
FROM ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
url, | ||
get_markup_info(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info | ||
FROM | ||
`httparchive.pages.2021_08_01_*`) | ||
GROUP BY | ||
client | ||
ORDER BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#standardSQL | ||
# Taken from Performance 2020 chapter | ||
# Removed the condition to look for only ECommerce | ||
|
||
## $4.11 run | ||
|
||
CREATE TEMPORARY FUNCTION getImageDimensions(payload STRING) | ||
RETURNS ARRAY<STRUCT<height INT64, width INT64>> LANGUAGE js AS ''' | ||
try { | ||
var $ = JSON.parse(payload); | ||
var images = JSON.parse($._Images); | ||
return images.map(i => ({height: i.naturalHeight, width: i.naturalWidth})); | ||
} catch (e) { | ||
return []; | ||
} | ||
'''; | ||
|
||
SELECT | ||
percentile, | ||
category, | ||
_TABLE_SUFFIX AS client, | ||
APPROX_QUANTILES(image.width, 1000)[OFFSET(percentile * 10)] AS image_width, | ||
APPROX_QUANTILES(image.height, 1000)[OFFSET(percentile * 10)] AS image_height | ||
FROM | ||
`httparchive.pages.2021_08_01_*` | ||
JOIN | ||
`httparchive.technologies.2021_08_01_*` | ||
USING (_TABLE_SUFFIX, url), | ||
UNNEST(getImageDimensions(payload)) AS image, | ||
UNNEST([10, 25, 50, 75, 90]) AS percentile | ||
GROUP BY | ||
percentile, | ||
client, | ||
category | ||
ORDER BY | ||
percentile, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#standardSQL | ||
# images mimetype vs extension | ||
SELECT | ||
client, | ||
ext, | ||
mimetype, | ||
COUNT(0) AS ext_mime_image_count, | ||
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_images, | ||
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) AS total_image_pct, | ||
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, ext)) AS ext_pct, | ||
SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client, mimetype)) AS mime_pct | ||
FROM | ||
`httparchive.almanac.requests` | ||
WHERE | ||
date = '2021-08-01' and | ||
type = 'image' | ||
GROUP BY | ||
client, | ||
ext, | ||
mimetype | ||
HAVING | ||
ext_mime_image_count > 10000 | ||
ORDER BY | ||
ext_mime_image_count DESC, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#standardSQL | ||
# images srcset candidates average | ||
|
||
# returns all the data we need from _media | ||
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) | ||
RETURNS STRUCT< | ||
num_srcset_all INT64, | ||
num_srcset_candidates_avg INT64 | ||
> LANGUAGE js AS ''' | ||
var result = {}; | ||
try { | ||
var media = JSON.parse(media_string); | ||
if (Array.isArray(media) || typeof media != 'object') return result; | ||
result.num_srcset_all = media.num_srcset_all; | ||
result.num_srcset_candidates_avg = | ||
media.num_srcset_all == 0? 0: (media.num_srcset_candidates / media.num_srcset_all); | ||
} catch (e) {} | ||
return result; | ||
'''; | ||
|
||
SELECT | ||
client, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <=3), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_3_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg >= 1 AND media_info.num_srcset_candidates_avg <= 5), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_1_5_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 5 AND media_info.num_srcset_candidates_avg <= 10), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_5_10_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 10 AND media_info.num_srcset_candidates_avg <= 15), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_10_15_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_candidates_avg > 15 AND media_info.num_srcset_candidates_avg <= 20), COUNTIF(media_info.num_srcset_all > 0)) AS pages_with_srcset_candidates_15_20_pct | ||
FROM ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info | ||
FROM | ||
`httparchive.pages.2021_08_01_*`) | ||
GROUP BY | ||
client | ||
ORDER BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#standardSQL | ||
# images with srcset descriptor_x descriptor_w | ||
|
||
# returns all the data we need from _media | ||
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) | ||
RETURNS STRUCT< | ||
num_srcset_all INT64, | ||
num_srcset_descriptor_x INT64, | ||
num_srcset_descriptor_w INT64 | ||
> LANGUAGE js AS ''' | ||
var result = {}; | ||
try { | ||
var media = JSON.parse(media_string); | ||
if (Array.isArray(media) || typeof media != 'object') return result; | ||
result.num_srcset_all = media.num_srcset_all; | ||
result.num_srcset_descriptor_x = media.num_srcset_descriptor_x; | ||
result.num_srcset_descriptor_w = media.num_srcset_descriptor_w; | ||
} catch (e) {} | ||
return result; | ||
'''; | ||
|
||
SELECT | ||
client, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_x > 0), COUNT(0)) AS pages_with_srcset_descriptor_x_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_descriptor_w > 0), COUNT(0)) AS pages_with_srcset_descriptor_w_pct, | ||
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_x), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_x_pct, | ||
SAFE_DIVIDE(SUM(media_info.num_srcset_descriptor_w), SUM(media_info.num_srcset_all)) AS instances_of_srcset_descriptor_w_pct | ||
FROM ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info | ||
FROM | ||
`httparchive.pages.2021_08_01_*`) | ||
GROUP BY | ||
client | ||
ORDER BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#standardSQL | ||
# images with srcset w/wo sizes | ||
|
||
# returns all the data we need from _media | ||
CREATE TEMPORARY FUNCTION get_media_info(media_string STRING) | ||
RETURNS STRUCT< | ||
num_srcset_all INT64, | ||
num_srcset_sizes INT64 | ||
> LANGUAGE js AS ''' | ||
var result = {}; | ||
try { | ||
var media = JSON.parse(media_string); | ||
if (Array.isArray(media) || typeof media != 'object') return result; | ||
result.num_srcset_all = media.num_srcset_all; | ||
result.num_srcset_sizes = media.num_srcset_sizes; | ||
} catch (e) {} | ||
return result; | ||
'''; | ||
|
||
SELECT | ||
client, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_all > 0), COUNT(0)) AS pages_with_srcset_pct, | ||
SAFE_DIVIDE(COUNTIF(media_info.num_srcset_sizes > 0), COUNT(0)) AS pages_with_srcset_sizes_pct, | ||
SAFE_DIVIDE((COUNTIF(media_info.num_srcset_all > 0) - COUNTIF(media_info.num_srcset_sizes > 0)), COUNT(0)) AS pages_with_srcset_wo_sizes_pct, | ||
SAFE_DIVIDE(SUM(media_info.num_srcset_sizes), SUM(media_info.num_srcset_all)) AS instances_of_srcset_sizes_pct, | ||
SAFE_DIVIDE((SUM(media_info.num_srcset_all) - SUM(media_info.num_srcset_sizes)), SUM(media_info.num_srcset_all)) AS instances_of_srcset_wo_sizes_pct | ||
FROM ( | ||
SELECT | ||
_TABLE_SUFFIX AS client, | ||
get_media_info(JSON_EXTRACT_SCALAR(payload, '$._media')) AS media_info | ||
FROM | ||
`httparchive.pages.2021_08_01_*`) | ||
GROUP BY | ||
client | ||
ORDER BY | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#standardSQL | ||
# images with covid in path - average size | ||
SELECT | ||
client, | ||
LOWER(ext) AS ext, | ||
COUNT(0) AS ext_count, | ||
COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid')) AS ext_count_covid, | ||
SAFE_DIVIDE(SUM(respSize), COUNT(0)) AS avg_size, | ||
SAFE_DIVIDE(SUM(IF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'), respSize, 0)), COUNTIF(REGEXP_CONTAINS(LOWER(url), r'[^/]*?[:]//[^/]*?/.*?covid'))) AS avg_size_covid | ||
FROM | ||
`httparchive.almanac.requests` | ||
WHERE | ||
date = '2021-08-01' AND | ||
type = 'image' | ||
GROUP BY | ||
client, | ||
ext | ||
HAVING | ||
ext_count_covid > 100 | ||
ORDER BY | ||
client, | ||
ext_count_covid DESC; |
Oops, something went wrong.