You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
162 lines
6.1 KiB
162 lines
6.1 KiB
"""Convert a Wordpress XML dump into to a (mostly working) pixywerk2 tree."""
|
|
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import os
|
|
import sys
|
|
from urllib.parse import urlparse
|
|
from xml.etree.ElementTree import ElementTree
|
|
|
|
import requests
|
|
|
|
# File name template for each saved post/page. save_cont fills {postdate} with
# the publication time formatted as YYYY-MM-DD-HHMMSS and {postname} with the
# text of the item's wp:post_name element.
FILE_PATTERN = "{postdate}-{postname}.thtml"
|
|
|
|
|
|
def parse_args(args):
    """Parse the command line and resolve the output subdirectories.

    Args:
        args: Argument strings without the program name
            (e.g. ``sys.argv[1:]``).

    Returns:
        The parsed ``argparse.Namespace``. ``post_dir``, ``page_dir`` and
        ``attachment_dir`` are rewritten so that each is joined onto
        ``out_dir``.
    """
    parser = argparse.ArgumentParser("importwp.py")

    parser.add_argument("input", help="The input file.")
    # nargs='?' is what makes the default effective: without it argparse
    # treats a positional as required and never consults ``default``.
    parser.add_argument("out_dir", help="Output root directory.", nargs='?', default='.')
    parser.add_argument(
        "--fetch-attachments",
        help="Fetch all attachments referred to in file.",
        action="store_true",
        dest='fetch_attachments',
    )
    parser.add_argument(
        "--attachment-dir",
        help="Subdirectory to place attachments in.",
        default="attachments",
        dest='attachment_dir',
    )
    parser.add_argument(
        "--post-dir",
        help="Subdirectory to place posts in.",
        default="posts",
        dest='post_dir',
    )
    parser.add_argument(
        "--page-dir",
        help="Subdirectory to place pages in.",
        default="",
        dest='page_dir',
    )

    result = parser.parse_args(args)

    # Make every subdirectory relative to the output root.
    result.post_dir = os.path.join(result.out_dir, result.post_dir)
    result.page_dir = os.path.join(result.out_dir, result.page_dir)
    result.attachment_dir = os.path.join(result.out_dir, result.attachment_dir)

    return result
|
|
|
|
|
|
def parse_input(xmlpath):
    """Read a WordPress export file and sort its items by post type.

    Only direct ``item`` children of the ``channel`` element are examined.
    Items without a ``wp:post_type`` element, items whose ``wp:status`` is
    ``draft``, and items of any type other than ``post``, ``attachment`` or
    ``page`` are ignored.

    Args:
        xmlpath: Path of the XML export to parse.

    Returns:
        A ``(posts, attachments, pages)`` tuple of dicts keyed by the text of
        each item's ``wp:post_id``. Every value is a dict of the raw XML
        elements (or ``None`` where an element is absent) under the keys
        ``content``, ``title``, ``pubdate``, ``description``, ``post_name``,
        ``categories`` (a list) and ``post_parent``; attachment records
        additionally carry ``att_url``.
    """
    wp_ns = "{http://wordpress.org/export/1.2/}"
    content_ns = "{http://purl.org/rss/1.0/modules/content/}"

    root = ElementTree().parse(source=xmlpath)

    posts, attachments, pages = {}, {}, {}
    # Dispatch table: post type text -> dict that collects items of that type.
    buckets = {"post": posts, "attachment": attachments, "page": pages}

    for item in root.find("channel"):
        if item.tag != "item":
            continue

        kind = item.find(wp_ns + "post_type")
        if kind is None:
            continue

        state = item.find(wp_ns + "status")
        if state is not None and state.text == "draft":
            continue

        bucket = buckets.get(kind.text)
        if bucket is None:
            # Some other post type; nothing to record.
            continue

        record = {
            'content': item.find(content_ns + "encoded"),
            'title': item.find("title"),
            'pubdate': item.find("pubDate"),
            'description': item.find("description"),
            'post_name': item.find(wp_ns + "post_name"),
            'categories': item.findall("category"),
            'post_parent': item.find(wp_ns + "post_parent"),
        }
        if kind.text == "attachment":
            record['att_url'] = item.find(wp_ns + "attachment_url")

        bucket[item.find(wp_ns + "post_id").text] = record

    return posts, attachments, pages
|
|
|
|
def fetch_attachment(attch, outdir, timeout=60):
    """Download one attachment into *outdir*.

    The file is named after the last path component of the attachment URL.
    Failures are reported on stderr and the attachment is skipped, so one
    unreachable file does not abort the whole import.

    Args:
        attch: Attachment record; only ``attch['att_url'].text`` (the URL to
            download) is read.
        outdir: Existing directory to write the downloaded file into.
        timeout: Seconds to wait for the server before giving up.
    """
    url = attch['att_url'].text
    basename = os.path.basename(urlparse(url).path)
    if not basename:
        # A URL whose path ends in "/" has no usable file name; opening
        # *outdir* itself for writing would fail.
        print("skipping attachment with no file name:", url, file=sys.stderr)
        return

    filename = os.path.join(outdir, basename)
    print("fetching attachment", url, "->", filename)

    try:
        # Without a timeout requests may block forever on a dead server.
        r = requests.get(url, timeout=timeout)
        # Don't save an HTTP error page as if it were the attachment.
        r.raise_for_status()
    except requests.RequestException as err:
        print("failed to fetch", url, "-", err, file=sys.stderr)
        return

    with open(filename, 'wb') as outf:
        outf.write(r.content)
|
|
|
|
def save_cont(post, outdir):
    """Write one post or page, plus a ``.meta`` JSON sidecar, into *outdir*.

    The content file is named from ``FILE_PATTERN`` using the publication
    time and the post name; the sidecar has the same name with ``.meta``
    appended.

    Args:
        post: Record of XML elements with the keys ``pubdate``, ``post_name``,
            ``title``, ``content``, ``description`` and ``categories``.
        outdir: Existing directory to write both files into.

    Raises:
        ValueError: If the publication date does not match the expected
            ``"%a, %d %b %Y %H:%M:%S %z"`` format.
    """
    dt = datetime.datetime.strptime(post['pubdate'].text, "%a, %d %b %Y %H:%M:%S %z")
    postdate = dt.strftime("%Y-%m-%d-%H%M%S")
    filename = FILE_PATTERN.format(postdate=postdate, postname=post['post_name'].text)
    print(post['title'].text, "->", filename)

    # ElementTree reports an empty element's text as None (and a missing
    # element is None itself); write an empty file rather than crashing.
    content = post['content']
    body = content.text if content is not None and content.text else ""

    # Explicit UTF-8 so non-ASCII posts do not depend on the platform locale.
    with open(os.path.join(outdir, filename), "w", encoding="utf-8") as outf:
        outf.write(body)

    # The <category> element whose domain is "category" becomes the post's
    # category (the last one wins); every other <category> is kept as a tag.
    tags = []
    category = ""
    for tg in post['categories']:
        if tg.attrib.get("domain") == "category":
            category = tg.text
        else:
            tags.append(tg.text)

    metadata = {
        "title": post['title'].text,
        "description": post['description'].text,
        "post_time": dt.timestamp(),
        "featured": "",
        "tags": tags,
        "category": category,
    }
    with open(os.path.join(outdir, filename + ".meta"), "w", encoding="utf-8") as outf:
        json.dump(metadata, outf)
|
|
|
|
|
|
def main():
    """Run the importer using the arguments in ``sys.argv``.

    Creates the output directories, parses the export file, optionally
    downloads attachments, then writes every post and page.

    Returns:
        ``0`` on completion.
    """
    args = parse_args(sys.argv[1:])

    out_dirs = [args.out_dir, args.page_dir, args.post_dir]
    if args.fetch_attachments:
        out_dirs.append(args.attachment_dir)
    for path in out_dirs:
        # makedirs also creates intermediate directories, so nested values
        # such as "--post-dir blog/posts" work; existing directories are fine.
        os.makedirs(path, exist_ok=True)

    posts, attachments, pages = parse_input(args.input)

    if args.fetch_attachments:
        for attachment in attachments.values():
            fetch_attachment(attachment, args.attachment_dir)

    for post in posts.values():
        save_cont(post, args.post_dir)
    for page in pages.values():
        save_cont(page, args.page_dir)

    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
|