# this is a hack
# This commit is contained in:
# @@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Scrape the Mumble wiki: export each page as raw markup, HTML, and Markdown,
and download the images the pages reference."""

import os
import re
import shutil

import requests
from creole import creole2html
from markdownify import markdownify
from selenium import webdriver
from selenium.webdriver.common.by import By
def get_titles(browser) -> list[str]:
    """Collect the titles of all non-redirect pages from Special:AllPages.

    Args:
        browser: A selenium WebDriver instance used to load and query pages.

    Returns:
        A list of page-title strings.

    Raises:
        Exception: If the page-list container cannot be located.
    """
    browser.get("https://wiki.mumble.info/wiki/Special:AllPages")

    page_list = browser.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
    if not page_list:
        raise Exception("ERROR: Could not get page list")

    pages = page_list[0].find_elements(By.TAG_NAME, "li")

    titles = []

    for page in pages:
        # BUG FIX: "class" is an HTML *attribute*, not a DOM property (the
        # property is named "className"), so get_property("class") always
        # returned None and redirect entries were never skipped.
        if page.get_attribute("class") == "allpagesredirect":
            continue
        # "title" IS a real DOM property on anchor elements, so
        # get_property is correct here.
        title_element = page.find_elements(By.TAG_NAME, "a")
        titles.append(title_element[0].get_property("title"))

    return titles
class PageSource:
    """Fetches and holds the wiki-markup source of a single MediaWiki page.

    Attributes:
        title: The wiki page title.
        contents: The raw markup of the page (from the edit textarea).
        image_urls: List of (alt_text, url) tuples for images on the page.
    """

    def __init__(self, title: str, browser):
        """Load the page's markup and image URLs using *browser*.

        Args:
            title: Wiki page title to fetch.
            browser: A selenium WebDriver instance.

        Raises:
            Exception: If the edit textbox or its contents cannot be read.
        """
        self.title = title

        # The raw markup is only reachable through the edit form's textarea.
        edit_link = f"https://wiki.mumble.info/index.php?title={title}&action=edit"
        browser.get(edit_link)

        source_textbox = browser.find_element(By.ID, "wpTextbox1")
        if not source_textbox:
            raise Exception(f"ERROR: could not find source textbox on page {edit_link}")

        contents = source_textbox.get_attribute("value")
        if not contents:
            raise Exception(f"ERROR: could not get page source for page {edit_link}")

        self.contents = contents

        self.image_urls = []

        # Visit the rendered page to discover the resolved image URLs.
        page_link = f"https://wiki.mumble.info/wiki/{title}"
        browser.get(page_link)

        images = browser.find_elements(By.CLASS_NAME, "image")
        for image in images:
            img_element = image.find_element(By.TAG_NAME, "img")
            img_url = img_element.get_attribute("src")
            # Renamed from "title" to stop shadowing the constructor argument.
            alt_text = img_element.get_attribute("alt")
            self.image_urls.append((alt_text, img_url))

    def as_html(self) -> str:
        """Return the page contents converted from creole markup to HTML."""
        return creole2html(self.contents)

    def as_md(self) -> str:
        """Convert the stored wiki markup to Markdown, line by line.

        Category and "br style" lines are dropped; ``[[File:...]]`` embeds
        become Markdown image links into the local images/ directory;
        headings and links get rewritten to Markdown syntax.

        Returns:
            The converted Markdown text.

        Raises:
            Exception: If a File: line has no parseable filename.
        """
        md = ""

        # [url text] external links and [[Page|label]] internal links.
        find_url = re.compile(r"(?<!\[)\[([^\[]+) (\S+)]")
        find_page_link = re.compile(r"\[\[(?!File:)([^\|]+)\|([^\|]+)]]")
        # Compiled once (was re-built per line via re.search on a raw string).
        find_file = re.compile(r"\[\[File:([^|]+)(\|[^\|\]]+)*]]")

        # Hoisted out of the loop: these closures don't depend on the line.
        def convert_url(match_obj):
            return f" [{match_obj.group(2)}]({match_obj.group(1)})"

        def convert_page_link(match_obj):
            return f"[{match_obj.group(2)}]({match_obj.group(1)}.md)"

        for line in self.contents.split("\n"):
            line = find_url.sub(convert_url, line)
            line = find_page_link.sub(convert_page_link, line)

            if "Category:" in line:
                continue
            elif "File:" in line:
                filename = find_file.search(line)
                if filename:
                    filename = filename.group(1).replace(" ", "_").lower()
                else:
                    raise Exception("No filename")
                # BUG FIX: the computed filename was never emitted (the
                # f-string was empty).  Write a Markdown image reference
                # matching where download_files() saves the image.
                md += f"![{filename}](images/{filename})"
            elif "{{" in line and "}}" in line:
                md += line.replace("{{", "# ").replace("}}", "")
            elif "===" in line:
                md += line.replace("===", "###", 1).replace("===", "")
            elif "==" in line:
                md += line.replace("==", "##", 1).replace("==", "")
            elif "br style" in line:
                continue
            else:
                md += line
            md += "\n"

        return md
def convert_markup_to_html(sources: "list[PageSource]"):
    """Write raw (.txt), HTML (.html), and Markdown (.md) copies of each page.

    Files go under output/, mirroring any "/" in the page title as
    subdirectories.

    Args:
        sources: Page sources to render; three files are written per page.
    """
    os.makedirs("output", exist_ok=True)
    for source in sources:
        out_base = f"output/{source.title}"
        # Titles may contain "/" (wiki subpages), so create intermediate
        # directories.  (Replaces the old phantom ".test" path that existed
        # only to feed dirname.)
        os.makedirs(os.path.dirname(out_base), exist_ok=True)
        # Wiki content can contain non-ASCII text; pin the encoding so the
        # output doesn't depend on the platform default.
        with open(f"{out_base}.txt", "w", encoding="utf-8") as f:
            f.write(source.contents)
        with open(f"{out_base}.html", "w", encoding="utf-8") as f:
            f.write(source.as_html())
        with open(f"{out_base}.md", "w", encoding="utf-8") as f:
            f.write(source.as_md())
def download_files(sources: "list[PageSource]"):
    """Download every image referenced by the given pages into output/images/.

    Args:
        sources: Pages whose image_urls (alt_text, url) pairs will be fetched.
    """
    os.makedirs("output/images", exist_ok=True)
    for source in sources:
        for alt_text, url in source.image_urls:
            # Normalize the alt text into a stable on-disk filename —
            # the same normalization as_md() uses for its image links.
            filename = alt_text.replace(" ", "_").lower()
            if not filename:
                continue
            if "url=" in filename:
                # Some alt texts are leftover URL fragments, not real names.
                continue
            # Request only after the skip checks, to avoid wasted fetches.
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                # BUG FIX: the destination was a literal placeholder path
                # ("output/images/(unknown)"), so every image overwrote the
                # same file; save under the normalized filename instead.
                with open(f"output/images/{filename}", "wb") as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
def main():
    """Scrape the Mumble wiki into output/ (txt/html/md plus images)."""
    browser = webdriver.Firefox()
    try:
        links = get_titles(browser)

        # Fetch each page's markup and image list up front, then render.
        sources = [PageSource(link, browser) for link in links]
        convert_markup_to_html(sources)

        download_files(sources)
    finally:
        # BUG FIX: the browser was never shut down, leaking a
        # Firefox/geckodriver process on every run (and on any error).
        browser.quit()


if __name__ == "__main__":
    main()
# Reference in New Issue
# Block a user