This repository has been archived by the owner on Oct 16, 2024. It is now read-only.

Snippet Removal Part 1— TSV generation to download_storages #1932

Draft · wants to merge 3 commits into base: develop
6 changes: 3 additions & 3 deletions capstone/capweb/templates/about.md
@@ -41,9 +41,9 @@ Cases published after 1922 do not include headnotes.
Here are some tsv-formatted spreadsheets with specific counts from our collection, and links to view those cases in the
API:

- * [Case Count by Reporter Series]({% static "downloads/cases_by_reporter.tsv" %})
- * [Case Count by Jurisdiction]({% static "downloads/cases_by_jurisdiction.tsv" %})
- * [Case Count by Decision Date]({% static "downloads/cases_by_decision_date.tsv" %})
+ * [Case Count by Reporter Series]({% url "download-files" "cases_by_reporter.tsv" %})
+ * [Case Count by Jurisdiction]({% url "download-files" "cases_by_jurisdiction.tsv" %})
+ * [Case Count by Decision Date]({% url "download-files" "cases_by_decision_date.tsv" %})

# Digitization Process {: class="subtitle" data-toc-label='Digitization' }

7 changes: 6 additions & 1 deletion capstone/fabfile.py
@@ -42,7 +42,7 @@

import capdb.tasks as tasks
from scripts import set_up_postgres, data_migrations, \
-    validate_private_volumes as validate_private_volumes_script, export, update_snippets
+    validate_private_volumes as validate_private_volumes_script, export, update_snippets, update_download_tsv
from scripts.helpers import copy_file, volume_barcode_from_folder, up_to_date_volumes, storage_lookup


@@ -842,9 +842,14 @@ def run_edit_script(script=None, dry_run='true', **kwargs):
    else:
        method(dry_run=dry_run, **kwargs)

@task
def update_files():
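    # regenerate the downloadable TSV reports (cases by jurisdiction, reporter, and decision date)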
    update_download_tsv.update_all()

@task
def update_all_snippets():
    update_snippets.update_all()
    update_files()

@task
def update_search_snippets():
32 changes: 0 additions & 32 deletions capstone/scripts/tests/test_snippet_updaters.py
@@ -14,38 +14,6 @@ def test_map_numbers(case_factory, jurisdiction):
    assert parsed[jurisdiction.slug]['volume_count'] == 3
    assert parsed[jurisdiction.slug]['page_count'] == 15

@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_decision_date(case_factory):
    dates = ["2000", "2000-04", "2000-04", "2000-04-15"]
    _ = [case_factory(decision_date_original=d) for d in dates]
    update_snippets.cases_by_decision_date_tsv()
    cases_by_decision_date = Snippet.objects.get(label='cases_by_decision_date')
    assert cases_by_decision_date.contents == (
        '"2000"\t4\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000&decision_date__lte=2000-12-31"\r\n'
        '"2000-04"\t3\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000-04&decision_date__lte=2000-04-31"\r\n'
        '"2000-04-15"\t1\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000-04-15&decision_date__lte=2000-04-15"\r\n'
    )

@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_jurisdiction(jurisdiction, case_factory):
    [case_factory(jurisdiction=jurisdiction) for i in range(3)]
    update_snippets.cases_by_jurisdiction_tsv()
    cases_by_jurisdiction = Snippet.objects.get(label='cases_by_jurisdiction')
    rows = cases_by_jurisdiction.contents.split("\r\n")[:-1]
    assert len(rows) == 1
    assert rows[0].split("\t")[1] == '"%s"' % jurisdiction.name_long
    assert rows[0].split("\t")[2] == '3'

@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_reporter(reporter, case_factory):
    [case_factory(reporter=reporter) for i in range(3)]
    update_snippets.cases_by_reporter_tsv()
    cases_by_reporter = Snippet.objects.get(label='cases_by_reporter')
    rows = cases_by_reporter.contents.split("\r\n")[:-1]
    assert len(rows) == 1
    assert rows[0].split("\t")[1] == '"%s"' % reporter.full_name
    assert rows[0].split("\t")[2] == '3'

@pytest.mark.django_db(databases=['capdb'])
def test_search_jurisdiction_list(jurisdiction):
    update_snippets.search_jurisdiction_list()
59 changes: 59 additions & 0 deletions capstone/scripts/tests/test_tsv_updaters.py
@@ -0,0 +1,59 @@
import pytest

from django.core.files.storage import FileSystemStorage

from scripts import update_download_tsv
from capweb.templatetags.api_url import api_url


@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_decision_date(case_factory, client, tmp_path, monkeypatch):
    dates = ["2000", "2000-04", "2000-04", "2000-04-15"]
    _ = [case_factory(decision_date_original=d) for d in dates]
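    # point the module-level download storage at a temp dir; a second
    # FileSystemStorage on the same location reads back what the code wrote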
    monkeypatch.setattr("scripts.update_download_tsv.download_files_storage", FileSystemStorage(location=str(tmp_path)))
    fs_storage = FileSystemStorage(location=str(tmp_path))

    update_download_tsv.cases_by_decision_date_tsv()

    file_contents = fs_storage.open(tmp_path / 'cases_by_decision_date.tsv').read()
    correct_contents = bytes('"2000"\t4\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000&decision_date__lte=2000-12-31"\r\n'
                             '"2000-04"\t3\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000-04&decision_date__lte=2000-04-31"\r\n'
                             '"2000-04-15"\t1\t"https://api.case.test:8000/v1/cases/?decision_date__gte=2000-04-15&decision_date__lte=2000-04-15"\r\n', encoding='utf-8')
    assert file_contents == correct_contents
@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_jurisdiction(jurisdiction, case_factory, client, tmp_path, monkeypatch):
    [case_factory(jurisdiction=jurisdiction) for i in range(3)]
    monkeypatch.setattr("scripts.update_download_tsv.download_files_storage", FileSystemStorage(location=str(tmp_path)))
    fs_storage = FileSystemStorage(location=str(tmp_path))

    update_download_tsv.cases_by_jurisdiction_tsv()

    file_contents = fs_storage.open(tmp_path / 'cases_by_jurisdiction.tsv').read()
    correct_contents = bytes('"{}"\t"{}"\t{}\t"{}"\t"{}"\r\n'.format(
        jurisdiction.name,
        jurisdiction.name_long,
        3,
        "{}?jurisdiction={}".format(api_url('cases-list'), jurisdiction.slug),
        "{}{}".format(api_url('jurisdiction-list'), jurisdiction.pk)), encoding='utf-8')
    assert file_contents == correct_contents


@pytest.mark.django_db(databases=['capdb'])
def test_cases_by_reporter(reporter, case_factory, client, tmp_path, monkeypatch):
    [case_factory(reporter=reporter) for i in range(3)]
    monkeypatch.setattr("scripts.update_download_tsv.download_files_storage", FileSystemStorage(location=str(tmp_path)))
    fs_storage = FileSystemStorage(location=str(tmp_path))

    update_download_tsv.cases_by_reporter_tsv()

    file_contents = fs_storage.open(tmp_path / 'cases_by_reporter.tsv').read()
    correct_contents = bytes('"{}"\t"{}"\t{}\t"{}"\t"{}"\r\n'.format(
        reporter.short_name,
        reporter.full_name,
        3,
        "{}?reporter={}".format(api_url('cases-list'), reporter.pk),
        "{}{}".format(api_url('reporter-list'), reporter.pk)
    ), encoding='utf-8')
    assert correct_contents in file_contents

115 changes: 115 additions & 0 deletions capstone/scripts/update_download_tsv.py
@@ -0,0 +1,115 @@
import io
import csv
from collections import defaultdict

from django.db.models import Count, Q
from capdb.models import Reporter, Jurisdiction, CaseMetadata
from capweb.templatetags.api_url import api_url
from tqdm import tqdm
from capdb.storages import download_files_storage

def update_all():
    cases_by_jurisdiction_tsv()
    cases_by_reporter_tsv()
    cases_by_decision_date_tsv()


def cases_by_decision_date_tsv(directory=""):
"""
count of all cases, grouped by decision date
"""
by_date = (CaseMetadata.objects
.in_scope()
.values('decision_date_original')
.annotate(Count('decision_date_original'))
.order_by('decision_date_original'))
label="cases_by_decision_date"
snippet_format="tsv"
output = io.StringIO()
writer = csv.writer(output, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)

# count dates
date_counter = defaultdict(int)
for group in tqdm(by_date):
date = group['decision_date_original']
count = group['decision_date_original__count']
# count year
date_counter[date[:4]] += count
# count year-month
if len(date) > 4:
date_counter[date[:7]] += count
# count year-month-day
if len(date) > 7:
date_counter[date] += count

# write dates
cases_url = api_url('cases-list')
for date, count in date_counter.items():
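        # pad to an inclusive upper bound: "2000" -> "2000-12-31", "2000-04" -> "2000-04-31"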
        max_date = date + "0000-12-31"[len(date):]
        writer.writerow([
            date,
            count,
            f"{cases_url}?decision_date__gte={date}&decision_date__lte={max_date}",
        ])

    write_update(label, snippet_format, output.getvalue(), directory=directory)


def cases_by_jurisdiction_tsv(directory=""):
"""
iterate through all reporters, tally each case, output TSV
"""
label="cases_by_jurisdiction"
snippet_format="tsv"
output = io.StringIO()
writer = csv.writer(output, delimiter='\t',quoting=csv.QUOTE_NONNUMERIC)
for jurisdiction in tqdm(Jurisdiction.objects.order_by('name').annotate(case_count=Count('case_metadatas', filter=Q(case_metadatas__in_scope=True)))):
if jurisdiction.case_count == 0:
continue
writer.writerow(
[
jurisdiction.name,
jurisdiction.name_long,
jurisdiction.case_count,
"{}?jurisdiction={}".format(api_url('cases-list'), jurisdiction.slug),
"{}{}".format(api_url('jurisdiction-list'), jurisdiction.pk)
]
)

write_update(label, snippet_format, output.getvalue(), directory=directory)


def cases_by_reporter_tsv(directory=""):
"""
iterate through all jurisdictions, tally each case, output TSV
"""
label="cases_by_reporter"
snippet_format="tsv"
output = io.StringIO()
writer = csv.writer(output, delimiter='\t',quoting=csv.QUOTE_NONNUMERIC)
for reporter in tqdm(Reporter.objects.order_by('full_name').annotate(case_count=Count('case_metadatas', filter=Q(case_metadatas__in_scope=True)))):
if reporter.case_count == 0:
continue
writer.writerow(
[
reporter.short_name,
reporter.full_name,
reporter.case_count,
"{}?reporter={}".format(api_url('cases-list'), reporter.pk),
"{}{}".format(api_url('reporter-list'), reporter.pk)
]
)

write_update(label, snippet_format, output.getvalue(), directory=directory)


def write_update(label, snippet_format, contents, directory):
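    """Write `contents` as "<label>.<snippet_format>" (optionally under `directory`) to download_files_storage."""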

    file_name = "{}.{}".format(label, snippet_format)
    full_path = "{}{}{}".format(directory, '/' if directory else '', file_name)

    if directory and not download_files_storage.exists(directory):
        download_files_storage.mkdir(directory, parents=True)

    with download_files_storage.open(full_path, 'w') as d:
        d.write(contents)
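
For orientation, a minimal sketch of how the new module would be exercised end to end. This assumes `download_files_storage` reads back the same paths `write_update` writes; the module, function, and file names are taken from the diff above.

```python
# Sketch only: regenerate the three download TSVs, then read one back
# through the same storage backend that write_update() targets.
from scripts import update_download_tsv
from capdb.storages import download_files_storage

update_download_tsv.update_all()  # writes cases_by_*.tsv via write_update()

with download_files_storage.open("cases_by_reporter.tsv", "r") as f:
    print(f.read())
```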
101 changes: 1 addition & 100 deletions capstone/scripts/update_snippets.py
@@ -1,113 +1,14 @@
import io
import csv
from collections import defaultdict

from django.db import connections
from django.db.models import Count, Q
-from capdb.models import Reporter, Jurisdiction, CaseMetadata, Snippet, Court
+from capdb.models import Reporter, Jurisdiction, Snippet, Court
import json
from capweb.templatetags.api_url import api_url
from tqdm import tqdm


def update_all():
    update_map_numbers()
    cases_by_jurisdiction_tsv()
    cases_by_reporter_tsv()
    cases_by_decision_date_tsv()
    search_reporter_list()
    search_court_list()
    court_abbrev_list()
    search_jurisdiction_list()

def cases_by_decision_date_tsv():
    """
    count of all cases, grouped by decision date
    """
    by_date = (CaseMetadata.objects
               .in_scope()
               .values('decision_date_original')
               .annotate(Count('decision_date_original'))
               .order_by('decision_date_original'))
    label="cases_by_decision_date"
    snippet_format="text/tab-separated-values"
    output = io.StringIO()
    writer = csv.writer(output, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)

    # count dates
    date_counter = defaultdict(int)
    for group in tqdm(by_date):
        print(group)
        date = group['decision_date_original']
        count = group['decision_date_original__count']
        # count year
        date_counter[date[:4]] += count
        # count year-month
        if len(date) > 4:
            date_counter[date[:7]] += count
        # count year-month-day
        if len(date) > 7:
            date_counter[date] += count

    # write dates
    cases_url = api_url('cases-list')
    for date, count in date_counter.items():
        max_date = date + "0000-12-31"[len(date):]
        writer.writerow([
            date,
            count,
            f"{cases_url}?decision_date__gte={date}&decision_date__lte={max_date}",
        ])

    write_update(label, snippet_format, output.getvalue())

def cases_by_jurisdiction_tsv():
    """
    iterate through all reporters, tally each case, output TSV
    """
    label="cases_by_jurisdiction"
    snippet_format="text/tab-separated-values"
    output = io.StringIO()
    writer = csv.writer(output, delimiter='\t',quoting=csv.QUOTE_NONNUMERIC)
    for jurisdiction in tqdm(Jurisdiction.objects.order_by('name').annotate(case_count=Count('case_metadatas', filter=Q(case_metadatas__in_scope=True)))):
        if jurisdiction.case_count == 0:
            continue
        writer.writerow(
            [
                jurisdiction.name,
                jurisdiction.name_long,
                jurisdiction.case_count,
                "{}?jurisdiction={}".format(api_url('cases-list'), jurisdiction.slug),
                "{}{}".format(api_url('jurisdiction-list'), jurisdiction.pk)
            ]
        )

    write_update(label, snippet_format, output.getvalue())


def cases_by_reporter_tsv():
    """
    iterate through all jurisdictions, tally each case, output TSV
    """
    label="cases_by_reporter"
    snippet_format="text/tab-separated-values"
    output = io.StringIO()
    writer = csv.writer(output, delimiter='\t',quoting=csv.QUOTE_NONNUMERIC)
    for reporter in tqdm(Reporter.objects.order_by('full_name').annotate(case_count=Count('case_metadatas', filter=Q(case_metadatas__in_scope=True)))):
        if reporter.case_count == 0:
            continue
        writer.writerow(
            [
                reporter.short_name,
                reporter.full_name,
                reporter.case_count,
                "{}?reporter={}".format(api_url('cases-list'), reporter.pk),
                "{}{}".format(api_url('reporter-list'), reporter.pk)
            ]
        )

    write_update(label, snippet_format, output.getvalue())

def update_map_numbers():
""" Write map_numbers snippet. """
label = "map_numbers"