[1/2] incubator-beam-site git commit: Add tool to fix links.

davor Tue, 08 Nov 2016 15:45:06 -0800

Repository: incubator-beam-site
Updated Branches:
  refs/heads/asf-site 268cadca4 -> 81bb48952



Add tool to fix links.

Signed-off-by: Jason Kuster <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/incubator-beam-site/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-beam-site/commit/e5828ee4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam-site/tree/e5828ee4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam-site/diff/e5828ee4

Branch: refs/heads/asf-site
Commit: e5828ee4a886bf02dda0099c3c60e15ac429ece3
Parents: 268cadc
Author: Jason Kuster <[email protected]>
Authored: Tue Nov 8 14:52:06 2016 -0800
Committer: Davor Bonaci <[email protected]>
Committed: Tue Nov 8 15:43:58 2016 -0800

----------------------------------------------------------------------
 tools/append_index_html_to_internal_links.py | 76 +++++++++++++++++++++++
 1 file changed, 76 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-beam-site/blob/e5828ee4/tools/append_index_html_to_internal_links.py
----------------------------------------------------------------------
diff --git a/tools/append_index_html_to_internal_links.py 
b/tools/append_index_html_to_internal_links.py
new file mode 100644
index 0000000..da87f57
--- /dev/null
+++ b/tools/append_index_html_to_internal_links.py
@@ -0,0 +1,76 @@
+"""Script to fix the links in the staged website.
+Finds all internal links which do not have index.html at the end and appends
+index.html in the appropriate place (preserving anchors, etc).
+
+Usage:
+  From root directory, after running the jekyll build, execute
+  'python tools/append_index_html_to_internal_links.py'.
+
+Dependencies:
+  beautifulsoup4 and lxml (the parser this script passes to BeautifulSoup).
+  Installable via pip as 'sudo pip install beautifulsoup4 lxml' or apt via
+  'sudo apt-get install python-bs4 python-lxml'.
+
+"""
+
+import fnmatch
+import os
+import re
+from bs4 import BeautifulSoup
+
+# Original link match. Matches any string which starts with '/' and doesn't
+# have a file extension.
+linkMatch = r'^\/(.*\.(?!([^\/]+)$))?[^.]*$'
+
+# Regex which matches strings of type /internal/link/#anchor. Breaks into two
+# groups for ease of inserting 'index.html'.
+anchorMatch1 = r'(.+\/)(#[^\/]+$)'
+
+# Regex which matches strings of type /internal/link#anchor. Breaks into two
+# groups for ease of inserting 'index.html'.
+anchorMatch2 = r'(.+\/[a-zA-Z0-9]+)(#[^\/]+$)'
+
+
+matches = []
+# Recursively walk content directory and find all html files.
+for root, dirnames, filenames in os.walk('content'):
+  for filename in fnmatch.filter(filenames, '*.html'):
+    # Javadoc does not have the index.html problem, so omit it.
+    if 'javadoc' not in root:
+      matches.append(os.path.join(root, filename))
+
+print 'Matches: ' + str(len(matches))
+# Iterates over each matched file looking for link matches.
+for match in matches:
+  print 'Fixing links in: ' + match
+  mf = open(match)
+  soup = BeautifulSoup(mf, "lxml")
+  # Iterates over every <a>
+  for a in soup.findAll('a'):
+    try:
+      hr = a['href']
+      if re.match(linkMatch, hr) is not None:
+        if hr.endswith('/'):
+          # /internal/link/
+          a['href'] = hr + 'index.html'
+        elif re.match(anchorMatch1, hr) is not None:
+          # /internal/link/#anchor
+          mat = re.match(anchorMatch1, hr)
+          a['href'] = mat.group(1) + 'index.html' + mat.group(2)
+        elif re.match(anchorMatch2, hr) is not None:
+          # /internal/link#anchor
+          mat = re.match(anchorMatch2, hr)
+          a['href'] = mat.group(1) + '/index.html' + mat.group(2)
+        else:
+          # /internal/link
+          a['href'] = hr + '/index.html'
+        mf.close()
+
+        html = soup.prettify("utf-8")
+        # Write back to the file.
+        with open(match, "wb") as f:
+          print 'Replacing ' + hr + ' with: ' + a['href']
+          f.write(html)
+    except KeyError as e:
+      # Some <a> tags don't have an href.
+      continue

[1/2] incubator-beam-site git commit: Add tool to fix links.

Reply via email to