-
-
Notifications
You must be signed in to change notification settings - Fork 24
/
urls.py
206 lines (151 loc) · 5.86 KB
/
urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""
Links to other external pages --> Do nothing
Links to other internal pages
Translate: Direct links with pages to anchor links
Translate: link+anchor to anchor links
Links to anchors
Translate: from #anchor to #page+anchor
So within a page:
Add a new anchor at the start of the page with an id="pagename"
id="anchor" to id="pagename-anchor"
href="#anchor" to href="#pagename-anchor"
href="a/" to href="#a"
href="a/#anchor" to href="#a-anchor"
# with use_directory_urls = True
[p.url for p in self.pages]
['', 'z/', 'a/']
# use_directory_urls = False
[p.url for p in self.pages]
['index.html', 'z.html', 'a.html']
"""
import html
import os
import posixpath
import re
def is_external(url):
    """
    Test whether a URL points outside the MkDocs site.

    External URLs must be left untouched when pages are merged into the
    print page. Besides web links this also covers non-page schemes such as
    ``mailto:`` and protocol-relative ``//host/path`` URLs, which would
    otherwise be mistaken for links to internal pages.

    Args:
        url (str): value of an ``href`` or ``src`` attribute

    Returns:
        bool: True when the URL should not be rewritten
    """
    # "http" (which also covers "https") and "www" are kept exactly as before
    # for backward compatibility; the remaining prefixes are schemes that can
    # never refer to a page or asset of the site itself.
    external_prefixes = (
        "http",
        "www",
        "//",
        "mailto:",
        "tel:",
        "ftp:",
        "data:",
    )
    return url.startswith(external_prefixes)
def url_to_anchor(url):
    """
    Translates an internal URL to an anchor URL

    Examples:
        /                         -> #index
        index.html                -> #index
        page/                     -> #page
        page.html#anchor          -> #page-anchor
        section/page.html#anchor  -> #section-page-anchor
        page/#anchor-link         -> #page-anchor-link

    Args:
        url (str): value of page.url, optionally followed by '#fragment'

    Returns:
        str: anchor of the form '#<page key>' or '#<page key>-<fragment>'
    """
    # Everything after the first '#' is the in-page fragment.
    page_part, _, fragment = url.partition("#")

    # Same normalisation steps as used for page keys elsewhere in this
    # module: lower-case, drop surrounding slashes and the '.html' suffix,
    # and flatten the remaining path separators to dashes.
    page_key = (
        page_part.lower().strip().strip("/").replace(".html", "").replace("/", "-")
    )

    # The site root ('' or '/') has no name of its own.
    anchor = "#" + (page_key or "index")
    if fragment:
        anchor += "-" + fragment
    return anchor
def get_page_key(page_url):
    """
    Get the page key.

    Used to prepend a unique key to internal anchorlinks,
    so when we combine all pages into one, we don't get conflicting (duplicate) URLs

    Examples
        get_page_key('') --> 'index'
        get_page_key('/') --> 'index'
        get_page_key('index.html') --> 'index'
        get_page_key('abc/') --> 'abc'
        get_page_key('abc.html') --> 'abc'
        get_page_key('section/abc.html') --> 'section-abc'

    Args:
        page_url (str): The MkDocs url of the page

    Returns:
        str: lower-cased key, never empty
    """
    page_key = (
        page_url.lower().strip().rstrip("/").replace(".html", "").replace("/", "-")
    )
    # An empty url, a bare '/' or whitespace all normalise to '' and refer to
    # the site root; fall back to 'index' so the key is never an empty string
    # (an empty key would produce id="" and href="#-anchor" downstream).
    return page_key or "index"
def fix_href_links(page_html, page_key):
    """
    Changes internal href HTML links to (anchor) links within the print page

    - External links (see is_external) are left untouched.
    - href="#anchor" becomes href="#<page_key>-anchor".
    - Links to other pages are resolved against the site root and become
      href="#<key of that page>" or href="#<key of that page>-anchor".

    Args:
        page_html (str): HTML of a single page
        page_key (str): key of that page, as returned by get_page_key

    Returns:
        str: the HTML with all internal links rewritten
    """
    # Loop over href links (example in https://regex101.com/r/rMAHrE/520)
    href_regex = re.compile(r"<a\s+([^>]*?\s+)?href=\"(.*?)\"", flags=re.IGNORECASE)

    def _rewrite_link(match):
        url = html.unescape(match.group(2))

        if is_external(url):
            return match.group()

        if url.startswith("#"):
            # This is an anchor link within a mkdocs page
            url = "#" + page_key + "-" + url[1:]
        else:
            # This is a link to another mkdocs page
            # url 'a/#anchor-link' should become '#a-anchor-link'
            # URLs always use forward slashes, so use posixpath instead of
            # os.path, which would produce backslashes on Windows.
            url_from_root = posixpath.normpath(posixpath.join("/", url))
            # Split on the first '#' only, so a fragment that itself contains
            # a '#' is kept intact instead of aborting the build.
            page_url, separator, fragment = url_from_root[1:].partition("#")
            url = "#" + get_page_key(page_url)
            if separator:
                url += "-" + fragment

        # The value was unescaped above, so escape it again before writing it
        # back into a double-quoted attribute.
        url = html.escape(url, quote=True)

        # Insert back any HTML between '<a' and 'href=', like "class='id'"
        other_html = match.group(1)
        if other_html is None:
            return '<a href="%s"' % url
        return '<a %s href="%s"' % (other_html.rstrip(), url)

    # A single substitution pass rewrites every link exactly once. Repeated
    # str.replace calls over the whole page could rewrite a link a second time
    # when a rewritten link happens to equal another, not yet processed, link.
    return href_regex.sub(_rewrite_link, page_html)
def update_anchor_ids(page_html, page_key):
    """
    Changes internal anchors to make sure they are unique within the print page.

    For example, changes all instances in pagename.html of id="anchor" to id="pagename-anchor"
    It does this only for the h1-h6 tags.

    Args:
        page_html (str): HTML of a single page
        page_key (str): key that is prepended to every heading id

    Returns:
        str: the HTML with every heading id prefixed by '<page_key>-'
    """
    # Group 1: the opening tag up to and including 'id="'
    # Group 2: the id value itself
    # Group 3: the closing quote
    # '[^>]*?' keeps the match inside one opening tag, so several headings on
    # the same line are each handled, and the whitespace required before 'id'
    # prevents attributes such as 'data-id' from matching.
    heading_id_regex = re.compile(
        r'(<h[1-6]\s+(?:[^>]*?\s+)?id=")([^"]+)(")', flags=re.IGNORECASE
    )

    def _prefix_id(match):
        # Only the id value is touched; replacing the id text anywhere in the
        # matched tag would corrupt tags such as <h2 id="h"> or <h2 id="id">.
        return match.group(1) + page_key + "-" + match.group(2) + match.group(3)

    # One substitution pass, so no heading is prefixed more than once.
    return heading_id_regex.sub(_prefix_id, page_html)
def fix_image_src(page_html, page_url, directory_urls):
    """
    Update img src path for images displayed in print page.

    This is because flattening all pages into 1 print page will break any relative links.

    Args:
        page_html (str): HTML of a single page
        page_url (str): URL of that page; image paths are resolved against its directory
        directory_urls (bool): when True, the resulting path is prefixed with '../'

    Returns:
        str: the HTML with the src of every non-external image rewritten
    """
    # Loop over all images src attributes
    # Example regex https://regex101.com/r/TTRsVW/1
    # Group 1: the opening tag up to and including 'src="'
    # Group 2: the src value (same set of accepted characters as before)
    # Group 3: the closing quote
    # '[^>]*?' keeps the match inside a single <img ...> tag.
    img_regex = re.compile(
        r"(<img\s+(?:[^>]*?\s+)?src=\")([aA-zZ|0-9|\-|\_|\.|\:|\/]+)(\")",
        flags=re.IGNORECASE,
    )

    # URLs always use forward slashes, so use posixpath instead of os.path,
    # which would produce backslashes on Windows.
    page_dir = posixpath.dirname(page_url)

    def _rewrite_src(match):
        img_src = match.group(2)
        if is_external(img_src):
            return match.group()

        new_url = posixpath.normpath(posixpath.join(page_dir, img_src))
        if directory_urls:
            new_url = posixpath.join("..", new_url)

        # Only the src value is swapped; other attributes that happen to
        # contain the same text (e.g. alt="a.png") stay as they are.
        return match.group(1) + new_url + match.group(3)

    # One substitution pass, so no image is rewritten more than once.
    return img_regex.sub(_rewrite_src, page_html)
def fix_internal_links(page_html, page_url, directory_urls):
    """
    Rewrite one page so it can be embedded in the combined print page.

    Links to internal pages become anchor links, heading ids are prefixed with
    the page key, image paths are updated, and the result is wrapped in a
    <section> whose id is the page key.

    Args:
        page_html (str): HTML of page
        page_url (str): URL of the page
        directory_urls (bool): Whether the mkdocs sites is using directory urls, see https://www.mkdocs.org/user-guide/configuration/?#use_directory_urls

    Returns:
        html (str): HTML of part of the print page with working internal links
    """
    key = get_page_key(page_url)

    # Order matters: links first, then heading ids, then image sources.
    with_links = fix_href_links(page_html, key)
    with_ids = update_anchor_ids(with_links, key)
    body = fix_image_src(with_ids, page_url, directory_urls)

    # Finally, wrap the entire page in a section with an anchor ID
    opening_tag = '<section class="print-page" id="%s">' % key
    return "".join((opening_tag, body, "</section>"))