I had to restart my blog after data corruption in my backups cost me the content. A good way to recover it turned out to be visiting https://web.archive.org, finding an older copy of my site, and saving its RSS feed locally. This post walks you through migrating from that saved feed (from a Ghost blog in my case) to an Astro site, converting it into clean, front-matter-rich Markdown posts that Astro can use. All you need is your RSS feed.
However, I was unable to import most of my images, which was a bummer.
✅ Step 1: Export Your RSS Feed
Export your feed as an `.xml` or `.txt` file. For example: `CerkitBlogRss_archives.txt`
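If you'd rather script that retrieval than click through the archive, here is a minimal sketch, assuming `requests` is installed; `FEED_URL`, the timestamp, and the output filename are placeholders to adapt (Ghost blogs serve their feed at `/rss/`):

```python
import requests

# Placeholder: the RSS URL of your old site (Ghost serves it at /rss/).
FEED_URL = "https://example.com/rss/"

# The Wayback Machine serves captures at /web/<timestamp>/<url> and
# redirects to the snapshot closest to <timestamp>; the "id_" suffix
# requests the original bytes without the archive toolbar injected.
archive_url = f"https://web.archive.org/web/20240101000000id_/{FEED_URL}"

response = requests.get(archive_url, timeout=30)
response.raise_for_status()

with open("CerkitBlogRss_archives.txt", "w", encoding="utf-8") as f:
    f.write(response.text)
```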
✅ Step 2: Use the Python Script
Install Python dependencies using a virtual environment:
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install html2text requests
```
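If you're curious what the converter produces before running the full script, here's a quick sanity check (the sample HTML is made up):

```python
import html2text

converter = html2text.HTML2Text()

sample = '<h2>Hello</h2><p>A <a href="https://example.com">link</a> and <em>emphasis</em>.</p>'
print(converter.handle(sample))
# Prints roughly:
# ## Hello
#
# A [link](https://example.com) and _emphasis_.
```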
Here’s the Python script you’ll use:
```python
import os
import re
import html
import requests
import html2text
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from datetime import datetime
from html.parser import HTMLParser

output_dir = "posts"
image_dir = "images"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

rss_file = "CerkitBlogRss_archives.txt"

converter = html2text.HTML2Text()
converter.ignore_links = False
converter.ignore_images = False

# XML namespaces Ghost's feed uses for the full post body and the author.
namespaces = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'dc': 'http://purl.org/dc/elements/1.1/'
}


class HTMLStripper(HTMLParser):
    """Collects only the text nodes of an HTML fragment."""
    def __init__(self):
        super().__init__()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        # Collapse runs of whitespace (including newlines) so the
        # description stays on one line in the YAML front matter.
        return ' '.join(' '.join(self.fed).split())


def strip_html(html_text):
    stripper = HTMLStripper()
    stripper.feed(html_text or "")
    return stripper.get_data()


def slugify(title):
    # Lowercase, then collapse anything non-alphanumeric into hyphens.
    return re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')


def download_image(url):
    # Download an image once and return its local path; on any failure,
    # fall back to the original remote URL so the post still renders.
    try:
        filename = os.path.basename(urlparse(url).path)
        local_path = os.path.join(image_dir, filename)
        if not os.path.isfile(local_path):
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(local_path, 'wb') as f:
                f.write(response.content)
        return f"/{image_dir}/{filename}"
    except Exception:
        return url


def replace_image_links(html_content):
    # Rewrite every src="..." attribute to point at the local copy.
    def repl(match):
        original_url = match.group(1)
        new_url = download_image(original_url)
        return f'src="{new_url}"'
    return re.sub(r'src="([^"]+)"', repl, html_content)


tree = ET.parse(rss_file)
root = tree.getroot()

for item in root.findall("./channel/item"):
    title = item.findtext("title") or "Untitled"
    pub_date = item.findtext("pubDate")
    author = item.findtext("dc:creator", namespaces=namespaces) or ""
    content = item.findtext("content:encoded", namespaces=namespaces)
    description = item.findtext("description")
    categories = item.findall("category")

    # Prefer the full body; fall back to the summary if it's missing.
    html_content = html.unescape(content or description or "")
    html_content = replace_image_links(html_content)
    markdown_content = converter.handle(html_content).strip()

    # Double quotes would break the quoted YAML string, so swap them.
    clean_description = strip_html(description or "").replace('"', "'")

    # pubDate looks like "Mon, 01 Jan 2024 12:00:00 GMT"; %Z accepts GMT/UTC.
    date_obj = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z")
    iso_date = date_obj.isoformat() + "Z"

    tags = [cat.text.strip() for cat in categories if cat.text]
    slug = slugify(title)
    filename = f"{output_dir}/{slug}.md"

    front_matter = f"""---
title: "{title.replace('"', "'")}"
pubDatetime: {iso_date}
description: "{clean_description}"
author: "{author}"
tags: {tags}
draft: false
---\n"""

    with open(filename, "w", encoding="utf-8") as f:
        f.write(front_matter + "\n" + markdown_content)

print(f"✅ All posts saved to ./{output_dir}/ and images to ./{image_dir}/")
```
✅ Step 3: Copy Into Astro
Once generated, copy the `.md` files into your Astro content collection, e.g.:
```bash
cp posts/*.md src/content/blog/
cp -r images/ public/images/
```
Done! You’ve now migrated your Ghost posts to Astro with full Markdown and clean YAML front matter.