-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrequest.py
149 lines (124 loc) · 5.42 KB
/
request.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import configparser
import math
import sys
from zeep import Client
from zeep import xsd
from bs4 import BeautifulSoup as bs
from mongo import insert_doc
from plugin import prevent_escaping_characters_in_cdata
from request_parser import parse_response_for_mongo
import csv_export
# Debug switch: when truthy, main() also writes the prettified raw response
# to response.txt.
WRITE_TO_FILE_DEBUG = 0

# The service credentials are looked up in the "eur-lex" section of eur_lex.ini.
config = configparser.ConfigParser()
config.read("eur_lex.ini")

# SOAP client built from the EUR-Lex WSDL, with the CDATA plugin imported
# from the local `plugin` module attached.
client = Client(
    wsdl="https://eur-lex.europa.eu/eurlex-ws?wsdl",
    plugins=[prevent_escaping_characters_in_cdata()],
)

# SOAP headers are not correctly defined inside the WSDL, so the Security
# header is declared here manually, innermost elements first.
_username_element = xsd.Element(
    "{http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd}Username",
    xsd.String(),
)
_password_element = xsd.Element(
    "{http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd}Password",
    xsd.String(),
)
_username_token_element = xsd.Element(
    "{http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd}UsernameToken",
    xsd.ComplexType([_username_element, _password_element]),
)
header = xsd.Element(
    "{http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd}Security",
    xsd.ComplexType([_username_token_element]),
)

# Concrete header value carrying the configured username and password; it is
# passed along with every doQuery call.
_credentials = {
    "Username": config.get("eur-lex", "username"),
    "Password": config.get("eur-lex", "password"),
}
header_value = header(UsernameToken=_credentials)

# All document languages available in EUR-Lex.
# For a descriptive list, see 13. in https://eur-lex.europa.eu/content/help/faq/intro.html
AVAILABLE_LANGUAGES = [
    'bg', 'hr', 'cs', 'da', 'nl', 'en',
    'et', 'fi', 'fr', 'de', 'el', 'hu',
    'ga', 'it', 'lv', 'lt', 'mt', 'pl',
    'pt', 'ro', 'sk', 'sl', 'es', 'sv',
]
def request_data(page_size=1, page=1, language='en'):
    """Send the fixed expert query to the web service and return the raw response.

    ``page``, ``page_size`` and ``language`` are forwarded to ``doQuery`` as
    ``page``, ``pageSize`` and ``searchLanguage``; the module-level
    ``header_value`` is sent as the SOAP header.
    """
    expert_query = "<![CDATA[SELECT TI_DISPLAY, TE, IX, I1, I2, VS , MO, CO, DI, DN, AU, CT, RJ, RJ_NEW, ECLI, DD, AJ, LB, AP, DF, CD, PR WHERE DTS_SUBDOM = EU_CASE_LAW AND (EMBEDDED_MANIFESTATION-TYPE = html OR xhtml or pdf) AND CASE_LAW_SUMMARY = false AND (DTT=C? AND DTS = 6) AND (FM_CODED = JUDG) ORDER BY DD ASC]]>"
    query_arguments = {
        "expertQuery": expert_query,
        "page": page,
        "pageSize": page_size,
        "searchLanguage": language,
        "_soapheaders": [header_value],
    }
    # zeep generates the doQuery operation from the service definition. The
    # raw response is requested on purpose, see
    # https://stackoverflow.com/questions/57730340/how-to-fix-str-object-has-no-attribute-keys-in-python-zeep-module
    with client.settings(raw_response=True):
        return client.service.doQuery(**query_arguments)
def request_all_data_for_language(lang, to_csv, page_size=100):
    """Fetch every page of query results for one language and store the documents.

    Args:
        lang: language code passed on to ``request_data``.
        to_csv: when truthy, each page of parsed documents is handed to
            ``csv_export.export_to_csv`` with the file name
            ``corpus_<lang>.csv``; otherwise every document is passed to
            ``insert_doc``.
        page_size: number of documents requested per page (default 100,
            the value that used to be hard-coded).

    Raises:
        ValueError: if ``page_size`` is smaller than 1, or if the first
            response contains no ``totalhits`` element to size the crawl.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1, got %r" % (page_size,))

    # A one-document request is enough to learn how many hits the query has;
    # only the hit count is read from it.
    probe = request_data(1, 1, lang)
    root = bs(probe.content, "lxml", from_encoding="UTF-8")
    total_hits_tag = root.find("totalhits")
    if total_hits_tag is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None.
        raise ValueError(
            "response for language %r contains no 'totalhits' element" % (lang,)
        )
    total_documents = int(total_hits_tag.text)

    page_count = math.ceil(total_documents / page_size)
    for page_number in range(1, page_count + 1):
        response = request_data(page_size=page_size, page=page_number, language=lang)
        docs = parse_response_for_mongo(response)
        if to_csv:
            csv_export.export_to_csv(docs, "corpus_" + lang + ".csv")
        else:
            for doc in docs:
                insert_doc(doc, lang)
def request_all_data(languages=None, to_csv=False):
    """Crawl all judgments for one or several languages. Use with caution!

    Args:
        languages: a single language code (``str``) or a list of language
            codes to request all documents for. Defaults to English only.
        to_csv: forwarded unchanged to ``request_all_data_for_language``.

    Codes that are not in ``AVAILABLE_LANGUAGES`` are reported on stdout and
    skipped. An argument that is neither a list nor a string is reported on
    stdout and nothing is requested.
    """
    if languages is None:
        # A None sentinel replaces the former mutable default ['en'].
        languages = ['en']
    elif isinstance(languages, str):
        # The former check, type(languages, str), always raised TypeError
        # because type() takes 1 or 3 arguments. A single code now goes
        # through the same validation as a list.
        languages = [languages]
    elif not isinstance(languages, list):
        print("REQUEST: request_all_data(): the specified argument is not of type 'list' or 'str'")
        return

    for current_lang in languages:
        if current_lang not in AVAILABLE_LANGUAGES:
            print("REQUEST: '", current_lang, "' is not a valid language.")
        else:
            request_all_data_for_language(current_lang, to_csv)
def main():
    """Request one page of results, store the parsed documents, optionally dump the response.

    Recognised command-line arguments (``sys.argv[1:]``):
        debug           turn on debug mode for ``parse_response_for_mongo``
        page=<int>      page number to request (default 1)
        pagesize=<int>  number of documents per page (default 10)
        dump=<mode>     ``all``, ``parse`` or ``response``; any other value
                        is ignored
    Unrecognised arguments are ignored. The language is always ``'en'``.

    Raises:
        ValueError: if the value of ``page=`` or ``pagesize=`` is not an
            integer.
    """
    page = 1
    page_size = 10
    dump_mode = None
    debug_mode = False
    language = 'en'

    for arg in sys.argv[1:]:
        if arg == "debug":
            debug_mode = True
        elif arg.startswith("page="):
            # Convert here so a malformed value fails before any request is
            # sent, and so the type matches the integer defaults.
            page = int(arg.split("page=", 1)[1])
        elif arg.startswith("pagesize="):
            page_size = int(arg.split("pagesize=", 1)[1])
        elif arg.startswith("dump="):
            val = arg.split("dump=", 1)[1]
            if val in ("all", "parse", "response"):
                dump_mode = val

    response = request_data(page_size, page, language)
    docs = parse_response_for_mongo(
        response, debug_mode=debug_mode, dump_mode=dump_mode
    )
    for doc in docs:
        insert_doc(doc, language)

    if WRITE_TO_FILE_DEBUG:
        root = bs(response.content, "lxml", from_encoding="UTF-8")
        # The context manager closes the file; it was previously left open.
        with open("response.txt", "w+", encoding="UTF8") as response_file:
            response_file.write(str(root.prettify()))
# if __name__ == "__main__":
# # main()
# response = request_data(20, 1, 'en')
# docs = parse_response_for_mongo(response)
# for doc in docs:
# csv_export.export_to_csv(doc)