Jobs export download #30

Open · wants to merge 8 commits into flask-cors-fix
Changes from all commits
17 changes: 2 additions & 15 deletions client/src/components/SingleJob.js
@@ -22,20 +22,6 @@ class SingleJob extends Component {
       .then(res => this.updateStatus());
   };
 
-  downloadFunc = () => {
-    fetchWithUserToken(`/api/jobs/${this.props.id}/files`, {method:"GET"})
-      .then(response => response.blob())
-      .then(blob => {
-        let url = window.URL.createObjectURL(blob);
-        let a = document.createElement("a");
-        a.href = url;
-        a.download = this.props.id + ".zip";
-        document.body.appendChild(a); // we need to append the element to the dom -> otherwise it will not work in firefox
-        a.click();
-        a.remove(); //afterwards we remove the element again
-      });
-  };
-
   updateStatus = () => {
     fetchWithUserToken(`/api/jobs/${this.props.id}`, {method:"GET"})
       .then(res => {
@@ -89,7 +75,8 @@ class SingleJob extends Component {
           <CardActions><TopButton
             variant="extended"
             color="primary"
-            onClick={this.downloadFunc}>
+            href={`/api/jobs/${this.props.id}/download`}
+          >
             Download
           </TopButton></CardActions>
         </div>,
180 changes: 180 additions & 0 deletions environment.yml
@@ -0,0 +1,180 @@
name: wc-server
channels:
- defaults
dependencies:
- bzip2=1.0.8=h1de35cc_0
- ca-certificates=2021.10.26=hecd8cb5_2
- libcxx=12.0.0=h2f01273_0
- libffi=3.3=hb1e8313_2
- ncurses=6.3=hca72f7f_2
- openssl=1.1.1m=hca72f7f_0
- pip=21.2.4=py310hecd8cb5_0
- python=3.10.0=hdfd78df_3
- readline=8.1.2=hca72f7f_1
- setuptools=58.0.4=py310hecd8cb5_0
- sqlite=3.37.0=h707629a_0
- tk=8.6.11=h7bc2e8c_0
- tzdata=2021e=hda174b7_0
- xz=5.2.5=h1de35cc_0
- zlib=1.2.11=h4dc903c_4
- pip:
- aniso8601==9.0.1
- anyio==3.5.0
- appnope==0.1.2
- argcomplete==1.10.0
- argon2-cffi==21.3.0
- argon2-cffi-bindings==21.2.0
- asttokens==2.0.5
- attrs==20.3.0
- authlib==0.15.5
- automat==20.2.0
- babel==2.9.1
- backcall==0.2.0
- beautifulsoup4==4.8.0
- billiard==3.6.3.0
- black==22.1.0
- bleach==4.1.0
- cachetools==5.0.0
- certifi==2020.12.5
- cffi==1.14.4
- chardet==3.0.4
- charset-normalizer==2.0.11
- click==8.0.3
- constantly==15.1.0
- cryptography==3.3.2
- cssselect==1.1.0
- debugpy==1.5.1
- decorator==5.1.1
- defusedxml==0.7.1
- deprecated==1.2.13
- docx2txt==0.8
- ebooklib==0.17.1
- entrypoints==0.4
- executing==0.8.2
- extract-msg==0.23.1
- filelock==3.0.12
- flask==1.1.2
- flask-restful==0.3.8
- google==3.0.0
- google-api-core==2.4.0
- google-api-python-client==2.33.0
- google-auth==2.6.0
- google-auth-httplib2==0.1.0
- googleapis-common-protos==1.54.0
- h2==3.2.0
- hpack==3.0.0
- html5lib==1.1
- httplib2==0.20.2
- hyperframe==5.2.0
- hyperlink==20.0.1
- idna==2.10
- imapclient==2.1.0
- incremental==17.5.0
- ipykernel==6.8.0
- ipython==8.0.1
- ipython-genutils==0.2.0
- itemadapter==0.2.0
- itemloaders==1.0.4
- itsdangerous==2.0.1
- jedi==0.18.1
- jinja2==3.0.3
- jmespath==0.10.0
- json5==0.9.6
- jsonschema==4.4.0
- jupyter-client==7.1.2
- jupyter-core==4.9.1
- jupyter-server==1.13.4
- jupyterlab==3.2.9
- jupyterlab-pygments==0.1.2
- jupyterlab-server==2.10.3
- lxml==4.6.5
- markupsafe==2.0.1
- matplotlib-inline==0.1.3
- mistune==0.8.4
- mypy-extensions==0.4.3
- nbclassic==0.3.5
- nbclient==0.5.10
- nbconvert==6.4.1
- nbformat==5.1.3
- nest-asyncio==1.5.4
- notebook==6.4.8
- numpy==1.22.2
- olefile==0.46
- packaging==21.3
- pandas==1.4.0
- pandocfilters==1.5.0
- parsel==1.6.0
- parso==0.8.3
- pathspec==0.9.0
- pdfminer-six==20181108
- pexpect==4.8.0
- pickleshare==0.7.5
- pillow==9.0.0
- platformdirs==2.4.1
- priority==1.3.0
- prometheus-client==0.13.1
- prompt-toolkit==3.0.26
- protego==0.1.16
- protobuf==3.19.4
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pycparser==2.20
- pycryptodome==3.9.9
- pydispatcher==2.0.5
- pygments==2.11.2
- pyhamcrest==2.0.2
- pymongo==3.11.2
- pyopenssl==20.0.0
- pyparsing==3.0.7
- pyrsistent==0.18.1
- python-dateutil==2.8.2
- python-dotenv==0.19.2
- python-pptx==0.6.18
- pytz==2020.4
- pyzmq==22.3.0
- queuelib==1.5.0
- redis==4.1.2
- regex==2020.11.13
- requests==2.25.0
- requests-file==1.5.1
- rq==1.8.1
- rsa==4.8
- scrapy==2.5.1
- scrapyscript==1.1.0
- send2trash==1.8.0
- service-identity==18.1.0
- six==1.12.0
- sniffio==1.2.0
- sortedcontainers==2.3.0
- soupsieve==2.0.1
- speechrecognition==3.8.1
- stack-data==0.1.4
- terminado==0.13.1
- testpath==0.5.0
- textract==1.6.3
- tldextract==3.1.0
- tomli==2.0.0
- tornado==6.1
- traitlets==5.1.1
- twisted==20.3.0
- tzlocal==1.5.1
- uritemplate==4.1.1
- urllib3==1.26.5
- w3lib==1.22.0
- wcwidth==0.2.5
- webencodings==0.5.1
- websocket-client==1.2.3
- werkzeug==2.0.2
- wheel==0.37.1
- wrapt==1.13.3
- xlrd==1.2.0
- xlsxwriter==1.3.7
- zope-interface==5.2.0
variables:
CLIENT_ORIGIN: http://localhost:3000
MONGO_URI: mongodb://localhost:27017
REACT_APP_SERVER_URL: http://localhost:5000
SERVER_PORT: '5000'
prefix: /Users/miclin/opt/anaconda3/envs/wc-server
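
The `server/crawler/output.py` module added below reads its connection details from `server.settings`, which is not part of this diff. A minimal sketch of what that module plausibly looks like, assuming it mirrors the environment variables declared above plus the collection and bucket names used in the refactored pipeline (all names here are inferred from this PR, not confirmed by it):

```python
# server/settings.py -- hypothetical sketch; the real module is not shown in this PR.
import os

MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
MONGO_USERNAME = os.environ.get("MONGO_USERNAME", "admin")
MONGO_PASSWORD = os.environ.get("MONGO_PASSWORD", "")
MONGO_DB = os.environ.get("MONGO_DB", "webcrawler")

# Literal values inferred from the constants the pipelines.py refactor removes.
MONGO_COLLECTION_ITEMS = "items"
MONGO_COLLECTION_OTHERS = "otherItems"
MONGO_BUCKET_IMAGES = "images"
MONGO_BUCKET_FILES = "files"
```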
4 changes: 3 additions & 1 deletion server/__init__.py
@@ -18,12 +18,14 @@ def create_app(test_config=None):
     app.register_blueprint(job_interfaces.bp)
     from .jobs import actions as job_actions
     app.register_blueprint(job_actions.bp)
+    from .jobs import download as download
+    app.register_blueprint(download.bp)
 
     return app
 
 
 def validate_server_settings():
-    assert (GOOGLE_OAUTH_CLIENT_URL is not None, "Missing Google oauth client ID")
+    assert (GOOGLE_OAUTH_CLIENT_ID is not None, "Missing Google oauth client ID")
     assert (FLASK_ENV != "production" or not DEBUG_NO_AUTH_ENABLED,
             "No auth mode mustn't be enabled for production environment")
     if DEBUG_NO_AUTH_ENABLED:
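
One caveat that applies to both the old and the new line: `assert (condition, "message")` asserts a two-element tuple, which is always truthy, so these checks can never fail (CPython even warns "assertion is always true, perhaps remove parentheses?"). A corrected sketch of the same checks, not part of the diff:

```python
# assert condition, message -- not assert (condition, message),
# which asserts a non-empty tuple and always passes.
assert GOOGLE_OAUTH_CLIENT_ID is not None, "Missing Google oauth client ID"
assert FLASK_ENV != "production" or not DEBUG_NO_AUTH_ENABLED, \
    "No-auth mode must not be enabled in a production environment"
```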
41 changes: 0 additions & 41 deletions server/crawler/getzip.py

This file was deleted.

1 change: 1 addition & 0 deletions server/crawler/items.py
@@ -5,6 +5,7 @@
 
 from scrapy.item import Item, Field
 
+
 class CrawlerItem(Item):
     """
     Represents the object parsed by the spider.
52 changes: 52 additions & 0 deletions server/crawler/output.py
@@ -0,0 +1,52 @@
import shutil

import pymongo
import gridfs
import os
import csv
import server.settings as settings
from shutil import make_archive
from tempfile import TemporaryDirectory
from bson import ObjectId

db = pymongo.MongoClient(
    settings.MONGO_URI, username=settings.MONGO_USERNAME,
    password=settings.MONGO_PASSWORD
)[settings.MONGO_DB]


def bucket_export(folder, bucket_name, job_id):
    bucket_folder_path = os.path.join(folder, f"{bucket_name}/")
    os.makedirs(os.path.dirname(bucket_folder_path), exist_ok=True)
    fs = gridfs.GridFSBucket(db, bucket_name)
    for f in fs.find({"job_id": job_id}):
        with open(os.path.join(bucket_folder_path, f.filename), "wb") \
                as local_file:
            fs.download_to_stream(f._id, local_file)


def items_export(folder, job_id):
    docs = db[settings.MONGO_COLLECTION_ITEMS]\
        .find({"job_id": job_id})

    if docs.count() == 0:
        return

    fields = list(docs[0].keys())
    output_file = os.path.join(folder, f"{job_id}.csv")
    with open(output_file, "w", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(docs)


def get_zip(job_id: str) -> str:
    dir_path = os.path.dirname(os.path.realpath(__file__))
    result_file_path = os.path.join(dir_path, "results", job_id)
    with TemporaryDirectory() as temp_dir:
        bucket_export(temp_dir, settings.MONGO_BUCKET_IMAGES, job_id)
        bucket_export(temp_dir, settings.MONGO_BUCKET_FILES, job_id)
        items_export(temp_dir, job_id)
        shutil.make_archive(result_file_path, "zip", temp_dir)

    return f"{result_file_path}.zip"
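
The `download` blueprint that `server/__init__.py` now registers is not included in this section of the diff. A minimal sketch of what it might look like, assuming it serves the archive produced by `get_zip` above at the `/api/jobs/<id>/download` URL the client now links to (the route name, URL prefix, and absence of an auth decorator are assumptions):

```python
# server/jobs/download.py -- hypothetical sketch; the actual module is not shown here.
from flask import Blueprint, send_file

from server.crawler.output import get_zip

bp = Blueprint("download", __name__, url_prefix="/api/jobs")


@bp.route("/<job_id>/download", methods=["GET"])
def download(job_id):
    # Build the zip on demand and let Flask set Content-Disposition so the
    # browser saves it under a sensible name (the client no longer sets
    # a.download itself). attachment_filename is the Flask 1.x spelling;
    # environment.yml pins flask==1.1.2.
    return send_file(get_zip(job_id), as_attachment=True,
                     attachment_filename=f"{job_id}.zip")
```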
28 changes: 12 additions & 16 deletions server/crawler/pipelines.py
@@ -32,14 +32,10 @@
 
 
 class MongoDBPipeline:
-    COLLECTION = "items"
-    BUCKET_COLLECTION = "bucketItems"
-    OTHER_COLLECTION = "otherItems"
-
     def __init__(self, mongo_uri, mongo_db, mongo_user='admin', mongo_pwd='', mongo_repl=False, mongo_repl_name=''):
         self.client = None
         self.db = None
-        self.grid_fs = None
         self.mongo_uri = mongo_uri
         self.mongo_db = mongo_db
         self.mongo_user = mongo_user
@@ -71,8 +67,6 @@ def open_spider(self, spider):
         self.client = pymongo.MongoClient(self.mongo_uri, username=self.mongo_user,
                                           password=self.mongo_password)
         self.db = self.client[self.mongo_db]
-        self.grid_fs = gridfs.GridFS(self.db,
-                                     collection=self.BUCKET_COLLECTION)
         print("Connected")
 
     def close_spider(self, spider):
@@ -98,23 +92,24 @@ def process_item(self, item, spider):
         query = {'url': item['url']}
 
         if not isinstance(item, CrawlerItem):
-            print("Not an instance of CrawlerItem")
-            print(item['url'])
-            self.db[self.OTHER_COLLECTION].replace_one(query, adapted_item, upsert=True)
+            logging.debug("Not an instance of CrawlerItem")
+            logging.debug(item['url'])
+            self.db[settings.MONGO_COLLECTION_OTHERS]\
+                .replace_one(query, adapted_item, upsert=True)
             return item
 
         # upsert=True means insert the document if the query doesn't find a match.
-        self.db[self.COLLECTION].replace_one(
+        self.db[settings.MONGO_COLLECTION_ITEMS].replace_one(
             query, adapted_item, upsert=True
         )
 
         urls = item["image_urls"]
         if type(urls) is list and len(urls) != 0:
-            self.save_to_bucket(urls, "images", spider)
+            self.save_to_bucket(urls, settings.MONGO_BUCKET_IMAGES, spider)
 
         urls = item["file_urls"]
         if type(urls) is list and len(urls) != 0:
-            self.save_to_bucket(urls, "files", spider)
+            self.save_to_bucket(urls, settings.MONGO_BUCKET_FILES, spider)
 
         logging.debug(f"MongoDB: Inserted {item['url']}.")
         return item
@@ -123,7 +118,8 @@ def save_to_bucket(self, urls, bucket_name, spider):
         for url in urls:
             mime_type = mimetypes.guess_type(url)[0]
             request = requests.get(url, stream=True)
-            self.grid_fs.put(request.raw, contentType=mime_type,
-                             user=spider.user if hasattr(spider, "user") else None,
-                             job_id=spider.job_id if hasattr(spider, "job_id") else None,
-                             filename=os.path.basename(url), bucket_name=bucket_name)
+            fs = gridfs.GridFS(self.db, bucket_name)
+            fs.put(request.raw, contentType=mime_type,
+                   user=spider.user if hasattr(spider, "user") else None,
+                   job_id=spider.job_id if hasattr(spider, "job_id") else None,
+                   filename=os.path.basename(url), bucket_name=bucket_name)
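
With this refactor, every write goes through a per-bucket `gridfs.GridFS(self.db, bucket_name)` handle instead of the single handle bound to the removed `BUCKET_COLLECTION`, which is what lets `bucket_export` in output.py read the same `images` and `files` collections back with `GridFSBucket`. A self-contained round-trip sketch under those assumptions (the connection string, database name, and job id are placeholders):

```python
import io

import gridfs
import pymongo

db = pymongo.MongoClient("mongodb://localhost:27017")["webcrawler"]

# Write the way save_to_bucket now does: one GridFS handle per bucket.
fs = gridfs.GridFS(db, "images")
fs.put(io.BytesIO(b"fake image bytes"), filename="logo.png",
       contentType="image/png", job_id="job-123")

# Read the way bucket_export does: a GridFSBucket over the same collection.
bucket = gridfs.GridFSBucket(db, "images")
for f in bucket.find({"job_id": "job-123"}):
    print(f.filename, len(bucket.open_download_stream(f._id).read()))
```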