PEP 8 formatting #7

Merged 1 commit on Nov 28, 2023
76 changes: 49 additions & 27 deletions downloader.py
@@ -23,25 +23,29 @@

 # additional checks for imghdr.what()
 def test_html(h, f):
-    if b'<html' in h:
-        return 'html'
+    if b"<html" in h:
+        return "html"


 imghdr.tests.append(test_html)

+
 def test_xml(h, f):
-    if b'<xml' in h:
-        return 'xml'
+    if b"<xml" in h:
+        return "xml"


 imghdr.tests.append(test_xml)

-def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
+
+def download_image(
+    image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None
+):
     proxies = None
     if proxy_type is not None:
         proxies = {
             "http": proxy_type + "://" + proxy,
-            "https": proxy_type + "://" + proxy
+            "https": proxy_type + "://" + proxy,
         }

     response = None
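
Note on the imghdr hooks above: imghdr.what() reads the first 32 bytes of the file and hands them as h to every function in imghdr.tests, returning the first non-None answer, so appending test_html and test_xml lets the downloader recognize servers that answer an image URL with an HTML or XML error page. A minimal standalone sketch of the same hook mechanism (test_svg and the sample path are hypothetical, not part of this PR):

import imghdr

def test_svg(h, f):
    # h holds the 32 header bytes imghdr.what() read; return a type name or None
    if b"<svg" in h:
        return "svg"

imghdr.tests.append(test_svg)
print(imghdr.what("download_images/img_0001.jpg"))  # built-in tests still run first, e.g. "jpeg"

One maintenance caveat: imghdr was deprecated by PEP 594 and removed in Python 3.13.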
@@ -50,39 +54,40 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
     while True:
         try:
             try_times += 1
-            #= image_url = image_url.split('&amp;')[0] # https://github.com/pablobots/Image-Downloader/commit/5bdbe076589459b9d0c41a563b92993cac1a892e
+            # = image_url = image_url.split('&amp;')[0]  # https://github.com/pablobots/Image-Downloader/commit/5bdbe076589459b9d0c41a563b92993cac1a892e
             response = requests.get(
-                image_url, headers=headers, timeout=timeout, proxies=proxies)
-            with open(file_path, 'wb') as f:
+                image_url, headers=headers, timeout=timeout, proxies=proxies
+            )
+            with open(file_path, "wb") as f:
                 f.write(response.content)
             response.close()

             file_type = imghdr.what(file_path)

-            if file_name.endswith('.jpeg'):
-                file_name = file_name.replace('.jpeg', '.jpg')
+            if file_name.endswith(".jpeg"):
+                file_name = file_name.replace(".jpeg", ".jpg")

-            if file_type == 'jpeg':
-                file_type = 'jpg'
+            if file_type == "jpeg":
+                file_type = "jpg"

             if file_type is None:
                 # os.remove(file_path)
                 print("## Err: TYPE({}) {}".format(file_type, file_name))
                 return False
-            elif file_type == 'html' or file_type == 'xml':
+            elif file_type == "html" or file_type == "xml":
                 os.remove(file_path)
                 print("## Err: TYPE({}) {}".format(file_type, image_url))
                 return False
             elif file_type in ["jpg", "jpeg", "png", "bmp", "webp"]:
                 if len(file_name) >= 200:
                     print("Truncating: {}".format(file_name))
                     file_name = file_name[:200]

                 if file_name.endswith("." + file_type):
                     new_file_name = file_name
-                else:
+                else:
                     new_file_name = "{}.{}".format(file_name, file_type)

                 new_file_path = os.path.join(dst_dir, new_file_name)
                 shutil.move(file_path, new_file_path)
                 print("## OK: {} {}".format(new_file_name, image_url))
@@ -92,7 +97,7 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
                 print("## Err: TYPE({}) {}".format(file_type, image_url))
                 return False
             break
-
+
         except Exception as e:
             if try_times < 3:
                 file_name = file_name + "a"
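
Note: the except branch above retries a failed download up to three times, appending "a" to the file name on each retry. A condensed, runnable sketch of that control flow, with a hypothetical flaky() standing in for the network fetch:

import random

def flaky():
    if random.random() < 0.7:
        raise IOError("simulated network error")

file_name = "img_0001"
try_times = 0
while True:
    try:
        try_times += 1
        flaky()
        print("saved", file_name)
        break
    except IOError:
        if try_times < 3:
            file_name = file_name + "a"  # same suffix scheme as download_image
            continue
        break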
@@ -104,7 +109,15 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
             break


-def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
+def download_images(
+    image_urls,
+    dst_dir,
+    file_prefix="img",
+    concurrency=50,
+    timeout=20,
+    proxy_type=None,
+    proxy=None,
+):
     """
     Download image according to given urls and automatically rename them in order.
     :param timeout:
@@ -123,19 +136,28 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
     future_list = list()
     count = 0
     success_downloads = 0

     if not os.path.exists(dst_dir):
         os.makedirs(dst_dir)
     for image_url in image_urls:
         # file_name = file_prefix + "_" + "%04d" % count
         print("## URL : {}".format(image_url))
         file_name = image_url
-        file_name = split_string(file_name, '?', 0)
-        file_name = split_string(file_name, '&amp;', 0)
-        file_name = split_string(file_name, '/', -1)
+        file_name = split_string(file_name, "?", 0)
+        file_name = split_string(file_name, "&amp;", 0)
+        file_name = split_string(file_name, "/", -1)
         print("## FILE: {}".format(file_name))
-        future_list.append(executor.submit(
-            download_image, image_url, dst_dir, file_name, timeout, proxy_type, proxy))
+        future_list.append(
+            executor.submit(
+                download_image,
+                image_url,
+                dst_dir,
+                file_name,
+                timeout,
+                proxy_type,
+                proxy,
+            )
+        )
         count += 1
     concurrent.futures.wait(future_list, timeout=180)

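
Note: the loop above is a plain fan-out/fan-in pattern: one submitted future per URL, then a single concurrent.futures.wait() with a 180-second overall deadline (the executor itself is created in unchanged lines outside this hunk). A self-contained sketch of the same pattern, with an illustrative worker and pool size:

import concurrent.futures

def work(n):
    return n * n

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(work, n) for n in range(10)]
    # wait() returns (done, not_done); not_done holds futures that missed the deadline
    done, not_done = concurrent.futures.wait(futures, timeout=180)
    print(sorted(f.result() for f in done))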
@@ -153,12 +175,12 @@ def split_string(str, delimiter, index):
         s, _, t = s.partition(delimiter)
         if index == 0:
             break
-        if t == '':
+        if t == "":
             break
         index = index - 1
         s = t

-    if s == '':
+    if s == "":
         s = str

     return s
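
For reference, split_string() keeps the first field when index is 0 and walks forward to the last field when index is -1 (restoring the original string if the selected field comes out empty), which is exactly what the file-name extraction in download_images relies on. Illustrative calls with a hypothetical URL:

print(split_string("https://example.com/imgs/cat.jpg?w=300", "?", 0))  # https://example.com/imgs/cat.jpg
print(split_string("https://example.com/imgs/cat.jpg", "/", -1))       # cat.jpg
print(split_string("no_delimiter_here", "?", 0))                       # no_delimiter_here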
147 changes: 111 additions & 36 deletions image_downloader.py
@@ -11,39 +11,102 @@
 import downloader
 import utils

+
 def main(argv):
     parser = argparse.ArgumentParser(description="Image Downloader")
-    parser.add_argument("keywords", type=str,
-                        help='Keywords to search. ("in quotes")')
-    parser.add_argument("--engine", "-e", type=str, default="Google",
-                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
-    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
-                        help="Image search engine.", choices=["chrome_headless", "chrome", "api"])
-    parser.add_argument("--max-number", "-n", type=int, default=100,
-                        help="Max number of images download for the keywords.")
-    parser.add_argument("--num-threads", "-j", type=int, default=50,
-                        help="Number of threads to concurrently download images.")
-    parser.add_argument("--timeout", "-t", type=int, default=10,
-                        help="Seconds to timeout when download an image.")
-    parser.add_argument("--output", "-o", type=str, default="./download_images",
-                        help="Output directory to save downloaded images.")
-    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
-                        help="Turn on safe search mode. (Only effective in Google)")
-    parser.add_argument("--face-only", "-F", action="store_true", default=False,
-                        help="Only search for faces (only available in Google)")
-    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
-                        help="Set http proxy (e.g. 192.168.0.2:8080)")
-    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
-                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
+    parser.add_argument("keywords", type=str, help='Keywords to search. ("in quotes")')
+    parser.add_argument(
+        "--engine",
+        "-e",
+        type=str,
+        default="Google",
+        help="Image search engine.",
+        choices=["Google", "Bing", "Baidu"],
+    )
+    parser.add_argument(
+        "--driver",
+        "-d",
+        type=str,
+        default="chrome_headless",
+        help="Image search engine.",
+        choices=["chrome_headless", "chrome", "api"],
+    )
+    parser.add_argument(
+        "--max-number",
+        "-n",
+        type=int,
+        default=100,
+        help="Max number of images download for the keywords.",
+    )
+    parser.add_argument(
+        "--num-threads",
+        "-j",
+        type=int,
+        default=50,
+        help="Number of threads to concurrently download images.",
+    )
+    parser.add_argument(
+        "--timeout",
+        "-t",
+        type=int,
+        default=10,
+        help="Seconds to timeout when download an image.",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="./download_images",
+        help="Output directory to save downloaded images.",
+    )
+    parser.add_argument(
+        "--safe-mode",
+        "-S",
+        action="store_true",
+        default=False,
+        help="Turn on safe search mode. (Only effective in Google)",
+    )
+    parser.add_argument(
+        "--face-only",
+        "-F",
+        action="store_true",
+        default=False,
+        help="Only search for faces (only available in Google)",
+    )
+    parser.add_argument(
+        "--proxy_http",
+        "-ph",
+        type=str,
+        default=None,
+        help="Set http proxy (e.g. 192.168.0.2:8080)",
+    )
+    parser.add_argument(
+        "--proxy_socks5",
+        "-ps",
+        type=str,
+        default=None,
+        help="Set socks5 proxy (e.g. 192.168.0.2:1080)",
+    )
     # type is not supported for Baidu
-    parser.add_argument("--type", "-ty", type=str, default=None,
-                        help="What kinds of images to download.", choices=["clipart", "linedrawing", "photograph"])
+    parser.add_argument(
+        "--type",
+        "-ty",
+        type=str,
+        default=None,
+        help="What kinds of images to download.",
+        choices=["clipart", "linedrawing", "photograph"],
+    )
     # Bing: color for colored images, bw for black&white images, other color contains Red, orange, yellow, green
     # Teal, Blue, Purple, Pink, Brown, Black, Gray, White
     # Baidu: white, bw, black, pink, blue, red, yellow, purple, green, teal, orange, brown
     # Google: bw, red, orange, yellow, green, teal, blue, purple, pink, white, gray, black, brown
-    parser.add_argument("--color", "-cl", type=str, default=None,
-                        help="Specify the color of desired images.")
+    parser.add_argument(
+        "--color",
+        "-cl",
+        type=str,
+        default=None,
+        help="Specify the color of desired images.",
+    )

     args = parser.parse_args(args=argv)

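
For context, the reflowed parser defines the same CLI as before; a typical invocation would look like this (keyword and proxy values are illustrative):

python image_downloader.py "cat" --engine Bing --max-number 200 --num-threads 20 \
    --timeout 10 --output ./download_images --proxy_socks5 192.168.0.2:1080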
@@ -60,18 +123,30 @@ def main(argv):
         print("Dependencies not resolved, exit.")
         return

-    crawled_urls = crawler.crawl_image_urls(args.keywords,
-                                            engine=args.engine, max_number=args.max_number,
-                                            face_only=args.face_only, safe_mode=args.safe_mode,
-                                            proxy_type=proxy_type, proxy=proxy,
-                                            browser=args.driver, image_type=args.type, color=args.color)
-    downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
-                               concurrency=args.num_threads, timeout=args.timeout,
-                               proxy_type=proxy_type, proxy=proxy,
-                               file_prefix=args.keywords + "_" + args.engine)
+    crawled_urls = crawler.crawl_image_urls(
+        args.keywords,
+        engine=args.engine,
+        max_number=args.max_number,
+        face_only=args.face_only,
+        safe_mode=args.safe_mode,
+        proxy_type=proxy_type,
+        proxy=proxy,
+        browser=args.driver,
+        image_type=args.type,
+        color=args.color,
+    )
+    downloader.download_images(
+        image_urls=crawled_urls,
+        dst_dir=args.output,
+        concurrency=args.num_threads,
+        timeout=args.timeout,
+        proxy_type=proxy_type,
+        proxy=proxy,
+        file_prefix=args.keywords + "_" + args.engine,
+    )

     print("Finished.")


-if __name__ == '__main__':
+if __name__ == "__main__":
     main(sys.argv[1:])
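
Note: because main() receives argv explicitly and the __main__ guard only forwards sys.argv[1:], the module stays importable for programmatic use, e.g.:

import image_downloader

image_downloader.main(["cat", "--engine", "Bing", "--max-number", "10"])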