Jobs export download #30

Open · wants to merge 8 commits into flask-cors-fix
Changes from all commits
17 changes: 2 additions & 15 deletions client/src/components/SingleJob.js
@@ -22,20 +22,6 @@ class SingleJob extends Component {
       .then(res => this.updateStatus());
   };
 
-  downloadFunc = () => {
-    fetchWithUserToken(`/api/jobs/${this.props.id}/files`, {method:"GET"})
-      .then(response => response.blob())
-      .then(blob => {
-        let url = window.URL.createObjectURL(blob);
-        let a = document.createElement("a");
-        a.href = url;
-        a.download = this.props.id + ".zip";
-        document.body.appendChild(a); // we need to append the element to the dom -> otherwise it will not work in firefox
-        a.click();
-        a.remove(); //afterwards we remove the element again
-      });
-  };
-
   updateStatus = () => {
     fetchWithUserToken(`/api/jobs/${this.props.id}`, {method:"GET"})
       .then(res => {
@@ -89,7 +75,8 @@ class SingleJob extends Component {
           <CardActions><TopButton
             variant="extended"
             color="primary"
-            onClick={this.downloadFunc}>
+            href={`/api/jobs/${this.props.id}/download`}
+          >
             Download
           </TopButton></CardActions>
         </div>,
180 changes: 180 additions & 0 deletions environment.yml
@@ -0,0 +1,180 @@
name: wc-server
channels:
- defaults
dependencies:
- bzip2=1.0.8=h1de35cc_0
- ca-certificates=2021.10.26=hecd8cb5_2
- libcxx=12.0.0=h2f01273_0
- libffi=3.3=hb1e8313_2
- ncurses=6.3=hca72f7f_2
- openssl=1.1.1m=hca72f7f_0
- pip=21.2.4=py310hecd8cb5_0
- python=3.10.0=hdfd78df_3
- readline=8.1.2=hca72f7f_1
- setuptools=58.0.4=py310hecd8cb5_0
- sqlite=3.37.0=h707629a_0
- tk=8.6.11=h7bc2e8c_0
- tzdata=2021e=hda174b7_0
- xz=5.2.5=h1de35cc_0
- zlib=1.2.11=h4dc903c_4
- pip:
- aniso8601==9.0.1
- anyio==3.5.0
- appnope==0.1.2
- argcomplete==1.10.0
- argon2-cffi==21.3.0
- argon2-cffi-bindings==21.2.0
- asttokens==2.0.5
- attrs==20.3.0
- authlib==0.15.5
- automat==20.2.0
- babel==2.9.1
- backcall==0.2.0
- beautifulsoup4==4.8.0
- billiard==3.6.3.0
- black==22.1.0
- bleach==4.1.0
- cachetools==5.0.0
- certifi==2020.12.5
- cffi==1.14.4
- chardet==3.0.4
- charset-normalizer==2.0.11
- click==8.0.3
- constantly==15.1.0
- cryptography==3.3.2
- cssselect==1.1.0
- debugpy==1.5.1
- decorator==5.1.1
- defusedxml==0.7.1
- deprecated==1.2.13
- docx2txt==0.8
- ebooklib==0.17.1
- entrypoints==0.4
- executing==0.8.2
- extract-msg==0.23.1
- filelock==3.0.12
- flask==1.1.2
- flask-restful==0.3.8
- google==3.0.0
- google-api-core==2.4.0
- google-api-python-client==2.33.0
- google-auth==2.6.0
- google-auth-httplib2==0.1.0
- googleapis-common-protos==1.54.0
- h2==3.2.0
- hpack==3.0.0
- html5lib==1.1
- httplib2==0.20.2
- hyperframe==5.2.0
- hyperlink==20.0.1
- idna==2.10
- imapclient==2.1.0
- incremental==17.5.0
- ipykernel==6.8.0
- ipython==8.0.1
- ipython-genutils==0.2.0
- itemadapter==0.2.0
- itemloaders==1.0.4
- itsdangerous==2.0.1
- jedi==0.18.1
- jinja2==3.0.3
- jmespath==0.10.0
- json5==0.9.6
- jsonschema==4.4.0
- jupyter-client==7.1.2
- jupyter-core==4.9.1
- jupyter-server==1.13.4
- jupyterlab==3.2.9
- jupyterlab-pygments==0.1.2
- jupyterlab-server==2.10.3
- lxml==4.6.5
- markupsafe==2.0.1
- matplotlib-inline==0.1.3
- mistune==0.8.4
- mypy-extensions==0.4.3
- nbclassic==0.3.5
- nbclient==0.5.10
- nbconvert==6.4.1
- nbformat==5.1.3
- nest-asyncio==1.5.4
- notebook==6.4.8
- numpy==1.22.2
- olefile==0.46
- packaging==21.3
- pandas==1.4.0
- pandocfilters==1.5.0
- parsel==1.6.0
- parso==0.8.3
- pathspec==0.9.0
- pdfminer-six==20181108
- pexpect==4.8.0
- pickleshare==0.7.5
- pillow==9.0.0
- platformdirs==2.4.1
- priority==1.3.0
- prometheus-client==0.13.1
- prompt-toolkit==3.0.26
- protego==0.1.16
- protobuf==3.19.4
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pycparser==2.20
- pycryptodome==3.9.9
- pydispatcher==2.0.5
- pygments==2.11.2
- pyhamcrest==2.0.2
- pymongo==3.11.2
- pyopenssl==20.0.0
- pyparsing==3.0.7
- pyrsistent==0.18.1
- python-dateutil==2.8.2
- python-dotenv==0.19.2
- python-pptx==0.6.18
- pytz==2020.4
- pyzmq==22.3.0
- queuelib==1.5.0
- redis==4.1.2
- regex==2020.11.13
- requests==2.25.0
- requests-file==1.5.1
- rq==1.8.1
- rsa==4.8
- scrapy==2.5.1
- scrapyscript==1.1.0
- send2trash==1.8.0
- service-identity==18.1.0
- six==1.12.0
- sniffio==1.2.0
- sortedcontainers==2.3.0
- soupsieve==2.0.1
- speechrecognition==3.8.1
- stack-data==0.1.4
- terminado==0.13.1
- testpath==0.5.0
- textract==1.6.3
- tldextract==3.1.0
- tomli==2.0.0
- tornado==6.1
- traitlets==5.1.1
- twisted==20.3.0
- tzlocal==1.5.1
- uritemplate==4.1.1
- urllib3==1.26.5
- w3lib==1.22.0
- wcwidth==0.2.5
- webencodings==0.5.1
- websocket-client==1.2.3
- werkzeug==2.0.2
- wheel==0.37.1
- wrapt==1.13.3
- xlrd==1.2.0
- xlsxwriter==1.3.7
- zope-interface==5.2.0
variables:
CLIENT_ORIGIN: http://localhost:3000
MONGO_URI: mongodb://localhost:27017
REACT_APP_SERVER_URL: http://localhost:5000
SERVER_PORT: '5000'
prefix: /Users/miclin/opt/anaconda3/envs/wc-server
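
The `server/crawler/output.py` module added below reads its connection details from `server.settings`, which is not part of this diff. A minimal sketch of what that module plausibly looks like, assuming it mirrors the environment variables declared above plus the collection and bucket names used in the refactored pipeline (all names here are inferred from this PR, not confirmed by it):

```python
# server/settings.py -- hypothetical sketch; the real module is not shown in this PR.
import os

MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
MONGO_USERNAME = os.environ.get("MONGO_USERNAME", "admin")
MONGO_PASSWORD = os.environ.get("MONGO_PASSWORD", "")
MONGO_DB = os.environ.get("MONGO_DB", "webcrawler")

# Literal values inferred from the constants the pipelines.py refactor removes.
MONGO_COLLECTION_ITEMS = "items"
MONGO_COLLECTION_OTHERS = "otherItems"
MONGO_BUCKET_IMAGES = "images"
MONGO_BUCKET_FILES = "files"
```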
4 changes: 3 additions & 1 deletion server/__init__.py
@@ -18,12 +18,14 @@ def create_app(test_config=None):
     app.register_blueprint(job_interfaces.bp)
     from .jobs import actions as job_actions
     app.register_blueprint(job_actions.bp)
+    from .jobs import download as download
+    app.register_blueprint(download.bp)
 
     return app
 
 
 def validate_server_settings():
-    assert (GOOGLE_OAUTH_CLIENT_URL is not None, "Missing Google oauth client ID")
+    assert (GOOGLE_OAUTH_CLIENT_ID is not None, "Missing Google oauth client ID")
     assert (FLASK_ENV != "production" or not DEBUG_NO_AUTH_ENABLED,
             "No auth mode mustn't be enabled for production environment")
     if DEBUG_NO_AUTH_ENABLED:
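
One caveat that applies to both the old and the new line: `assert (condition, "message")` asserts a two-element tuple, which is always truthy, so these checks can never fail (CPython even warns "assertion is always true, perhaps remove parentheses?"). A corrected sketch of the same checks, not part of the diff:

```python
# assert condition, message -- not assert (condition, message),
# which asserts a non-empty tuple and always passes.
assert GOOGLE_OAUTH_CLIENT_ID is not None, "Missing Google oauth client ID"
assert FLASK_ENV != "production" or not DEBUG_NO_AUTH_ENABLED, \
    "No-auth mode must not be enabled in a production environment"
```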
41 changes: 0 additions & 41 deletions server/crawler/getzip.py

This file was deleted.

1 change: 1 addition & 0 deletions server/crawler/items.py
@@ -5,6 +5,7 @@
 
 from scrapy.item import Item, Field
 
+
 class CrawlerItem(Item):
     """
     Represents the object parsed by the spider.
52 changes: 52 additions & 0 deletions server/crawler/output.py
@@ -0,0 +1,52 @@
import shutil

import pymongo
import gridfs
import os
import csv
import server.settings as settings
from shutil import make_archive
from tempfile import TemporaryDirectory
from bson import ObjectId

db = pymongo.MongoClient(
    settings.MONGO_URI, username=settings.MONGO_USERNAME,
    password=settings.MONGO_PASSWORD
)[settings.MONGO_DB]


def bucket_export(folder, bucket_name, job_id):
    bucket_folder_path = os.path.join(folder, f"{bucket_name}/")
    os.makedirs(os.path.dirname(bucket_folder_path), exist_ok=True)
    fs = gridfs.GridFSBucket(db, bucket_name)
    for f in fs.find({"job_id": job_id}):
        with open(os.path.join(bucket_folder_path, f.filename), "wb") \
                as local_file:
            fs.download_to_stream(f._id, local_file)


def items_export(folder, job_id):
    docs = db[settings.MONGO_COLLECTION_ITEMS]\
        .find({"job_id": job_id})

    if docs.count() == 0:
        return

    fields = list(docs[0].keys())
    output_file = os.path.join(folder, f"{job_id}.csv")
    with open(output_file, "w", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(docs)


def get_zip(job_id: str) -> str:
    dir_path = os.path.dirname(os.path.realpath(__file__))
    result_file_path = os.path.join(dir_path, "results", job_id)
    with TemporaryDirectory() as temp_dir:
        bucket_export(temp_dir, settings.MONGO_BUCKET_IMAGES, job_id)
        bucket_export(temp_dir, settings.MONGO_BUCKET_FILES, job_id)
        items_export(temp_dir, job_id)
        shutil.make_archive(result_file_path, "zip", temp_dir)

    return f"{result_file_path}.zip"
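
The `download` blueprint that `server/__init__.py` now registers is not included in this section of the diff. A minimal sketch of what it might look like, assuming it serves the archive produced by `get_zip` above at the `/api/jobs/<id>/download` URL the client now links to (the route name, URL prefix, and absence of an auth decorator are assumptions):

```python
# server/jobs/download.py -- hypothetical sketch; the actual module is not shown here.
from flask import Blueprint, send_file

from server.crawler.output import get_zip

bp = Blueprint("download", __name__, url_prefix="/api/jobs")


@bp.route("/<job_id>/download", methods=["GET"])
def download(job_id):
    # Build the zip on demand and let Flask set Content-Disposition so the
    # browser saves it under a sensible name (the client no longer sets
    # a.download itself). attachment_filename is the Flask 1.x spelling;
    # environment.yml pins flask==1.1.2.
    return send_file(get_zip(job_id), as_attachment=True,
                     attachment_filename=f"{job_id}.zip")
```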
28 changes: 12 additions & 16 deletions server/crawler/pipelines.py
@@ -32,14 +32,10 @@
 
 
 class MongoDBPipeline:
-    COLLECTION = "items"
-    BUCKET_COLLECTION = "bucketItems"
-    OTHER_COLLECTION = "otherItems"
-
     def __init__(self, mongo_uri, mongo_db, mongo_user='admin', mongo_pwd='', mongo_repl=False, mongo_repl_name=''):
         self.client = None
         self.db = None
-        self.grid_fs = None
         self.mongo_uri = mongo_uri
         self.mongo_db = mongo_db
         self.mongo_user = mongo_user
@@ -71,8 +67,6 @@ def open_spider(self, spider):
         self.client = pymongo.MongoClient(self.mongo_uri, username=self.mongo_user,
                                           password=self.mongo_password)
         self.db = self.client[self.mongo_db]
-        self.grid_fs = gridfs.GridFS(self.db,
-                                     collection=self.BUCKET_COLLECTION)
         print("Connected")
 
     def close_spider(self, spider):
@@ -98,23 +92,24 @@ def process_item(self, item, spider):
         query = {'url': item['url']}
 
         if not isinstance(item, CrawlerItem):
-            print("Not an instance of CrawlerItem")
-            print(item['url'])
-            self.db[self.OTHER_COLLECTION].replace_one(query, adapted_item, upsert=True)
+            logging.debug("Not an instance of CrawlerItem")
+            logging.debug(item['url'])
+            self.db[settings.MONGO_COLLECTION_OTHERS]\
+                .replace_one(query, adapted_item, upsert=True)
             return item
 
         # upsert=True means insert the document if the query doesn't find a match.
-        self.db[self.COLLECTION].replace_one(
+        self.db[settings.MONGO_COLLECTION_ITEMS].replace_one(
             query, adapted_item, upsert=True
         )
 
         urls = item["image_urls"]
         if type(urls) is list and len(urls) != 0:
-            self.save_to_bucket(urls, "images", spider)
+            self.save_to_bucket(urls, settings.MONGO_BUCKET_IMAGES, spider)
 
         urls = item["file_urls"]
         if type(urls) is list and len(urls) != 0:
-            self.save_to_bucket(urls, "files", spider)
+            self.save_to_bucket(urls, settings.MONGO_BUCKET_FILES, spider)
 
         logging.debug(f"MongoDB: Inserted {item['url']}.")
         return item
@@ -123,7 +118,8 @@ def save_to_bucket(self, urls, bucket_name, spider):
         for url in urls:
             mime_type = mimetypes.guess_type(url)[0]
             request = requests.get(url, stream=True)
-            self.grid_fs.put(request.raw, contentType=mime_type,
-                             user=spider.user if hasattr(spider, "user") else None,
-                             job_id=spider.job_id if hasattr(spider, "job_id") else None,
-                             filename=os.path.basename(url), bucket_name=bucket_name)
+            fs = gridfs.GridFS(self.db, bucket_name)
+            fs.put(request.raw, contentType=mime_type,
+                   user=spider.user if hasattr(spider, "user") else None,
+                   job_id=spider.job_id if hasattr(spider, "job_id") else None,
+                   filename=os.path.basename(url), bucket_name=bucket_name)
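
With this refactor, every write goes through a per-bucket `gridfs.GridFS(self.db, bucket_name)` handle instead of the single handle bound to the removed `BUCKET_COLLECTION`, which is what lets `bucket_export` in output.py read the same `images` and `files` collections back with `GridFSBucket`. A self-contained round-trip sketch under those assumptions (the connection string, database name, and job id are placeholders):

```python
import io

import gridfs
import pymongo

db = pymongo.MongoClient("mongodb://localhost:27017")["webcrawler"]

# Write the way save_to_bucket now does: one GridFS handle per bucket.
fs = gridfs.GridFS(db, "images")
fs.put(io.BytesIO(b"fake image bytes"), filename="logo.png",
       contentType="image/png", job_id="job-123")

# Read the way bucket_export does: a GridFSBucket over the same collection.
bucket = gridfs.GridFSBucket(db, "images")
for f in bucket.find({"job_id": "job-123"}):
    print(f.filename, len(bucket.open_download_stream(f._id).read()))
```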