-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconvert.py
65 lines (54 loc) · 2.09 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
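# Prerequisites:
#   - pandoc (http://johnmacfarlane.net/pandoc/) installed and available on PATH
#   - PyYAML: pip install PyYAML
#   - a title.md file in the current working directory (passed to pandoc first)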
import subprocess
import yaml
import urllib.request
import os
import sys
from functools import reduce
# --- Configuration ----------------------------------------------------------
# An optional first command-line argument selects the GitHub account whose
# kotlin-web-site repository is used; without it, 'JetBrains' is used.
github_user = 'JetBrains' if len(sys.argv) == 1 else sys.argv[1]
base_url = "https://raw.githubusercontent.com/{}/kotlin-web-site/master".format(github_user)
navigation_url = "{}/data/_nav.yml".format(base_url)
reference_url = "{}/pages".format(base_url)
tmp_path = '/tmp/kotlin-one-epub/'
pandoc_extensions = [
    "+pipe_tables", "+backtick_code_blocks", "+yaml_metadata_block", "+inline_code_attributes"
]
# Titles of the top-level navigation sections that are left out of the book.
excludes = ['Reference', 'Core Libraries', "What's New", 'Releases and Roadmap']


def fetch_navigation(url=navigation_url):
    """Download the navigation YAML from *url* and return it parsed.

    Uses ``yaml.safe_load``, so the remote document cannot construct
    arbitrary Python objects.
    """
    print("Fetching navigation...")
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    print("Parsing navigation...")
    return yaml.safe_load(raw)


def collect_page_urls(navigation, pages_url=reference_url):
    """Return the markdown URL of every page that goes into the book.

    Walks ``navigation['reference']['content']``, skips sections whose title
    is listed in ``excludes`` and sections that have no ``content`` entry,
    and maps each page's ``url`` to ``pages_url`` + that path with the first
    ``.html`` replaced by ``.md``.

    Returns an empty list when no section contributes any page.
    Raises ``KeyError`` if a page entry has no ``url`` key.
    """
    urls = []
    for section in navigation['reference']['content']:
        if section.get('title') in excludes:
            continue
        # 'content' may be missing (or empty) for sections without pages.
        for page in section.get('content') or []:
            urls.append(pages_url + page['url'].replace('.html', '.md', 1))
    return urls


def download_pages(urls, target_dir=tmp_path):
    """Download each URL into *target_dir* and return the written file paths.

    Files are named ``0.md``, ``1.md``, ... following the order of *urls*, so
    the returned list preserves the navigation order for pandoc.
    """
    os.makedirs(target_dir, exist_ok=True)
    filenames = []
    for index, url in enumerate(urls):
        print("Downloading " + url + "...")
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')
        # Strip the "{: .keyword }" markers from the page source (presumably
        # site-specific markup that should not appear in the book text).
        text = text.replace("{: .keyword }", "")
        filename = os.path.join(target_dir, str(index) + ".md")
        # Write with the same encoding the content was decoded with, instead
        # of whatever the platform's locale default happens to be.
        with open(filename, 'w', encoding='utf-8') as page_file:
            page_file.write(text)
        filenames.append(filename)
    return filenames


def build_pandoc_command(markdown_files, output='kotlin.epub'):
    """Return the pandoc invocation as an argument list (no shell involved).

    ``title.md`` (expected in the current working directory) is passed first,
    followed by *markdown_files* in the given order.
    """
    return [
        "pandoc", "-s", "--toc",
        "--from=markdown" + ''.join(pandoc_extensions),
        "--to=epub3",
        "--output=" + output,
        "title.md",
    ] + list(markdown_files)


def main():
    """Fetch the navigation, download all pages and build kotlin.epub."""
    navigation = fetch_navigation()
    urls = collect_page_urls(navigation)
    tmp_files = download_pages(urls)

    print("Running pandoc...")
    try:
        exit_code = subprocess.call(build_pandoc_command(tmp_files))
    except OSError as err:
        # Raised e.g. when the pandoc executable cannot be found.
        sys.exit("Could not run pandoc: {}".format(err))
    if exit_code != 0:
        # Do not claim success when pandoc reported a failure.
        sys.exit("pandoc failed with exit code {}".format(exit_code))
    print("Done. Check kotlin.epub!")


if __name__ == "__main__":
    main()