I had to restart my blog after data corruption in my backups cost me the content. A good way to recover it turned out to be visiting https://web.archive.org, finding an older copy of my site, and saving its RSS feed locally. This post walks you through migrating from that saved feed (from a Ghost blog in my case) to an Astro site, converting it into clean, front-matter-rich Markdown posts that Astro can use. All you need is your RSS feed.
However, I was unable to import most of my images, which was a bummer.
✅ Step 1: Export Your RSS Feed
Export your feed as an `.xml` or `.txt` file. For example: `CerkitBlogRss_archives.txt`
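If you'd rather script that retrieval than click through the archive, here is a minimal sketch, assuming `requests` is installed; `FEED_URL`, the timestamp, and the output filename are placeholders to adapt (Ghost blogs serve their feed at `/rss/`):

```python
import requests

# Placeholder: the RSS URL of your old site (Ghost serves it at /rss/).
FEED_URL = "https://example.com/rss/"

# The Wayback Machine serves captures at /web/<timestamp>/<url> and
# redirects to the snapshot closest to <timestamp>; the "id_" suffix
# requests the original bytes without the archive toolbar injected.
archive_url = f"https://web.archive.org/web/20240101000000id_/{FEED_URL}"

response = requests.get(archive_url, timeout=30)
response.raise_for_status()

with open("CerkitBlogRss_archives.txt", "w", encoding="utf-8") as f:
    f.write(response.text)
```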
✅ Step 2: Use the Python Script
Install Python dependencies using a virtual environment:
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install html2text requests
```
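If you're curious what the converter produces before running the full script, here's a quick sanity check (the sample HTML is made up):

```python
import html2text

converter = html2text.HTML2Text()

sample = '<h2>Hello</h2><p>A <a href="https://example.com">link</a> and <em>emphasis</em>.</p>'
print(converter.handle(sample))
# Prints roughly:
# ## Hello
#
# A [link](https://example.com) and _emphasis_.
```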
Here’s the Python script you’ll use:
```python
import os
import re
import html
import requests
import html2text
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from datetime import datetime
from html.parser import HTMLParser

output_dir = "posts"
image_dir = "images"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

rss_file = "CerkitBlogRss_archives.txt"

converter = html2text.HTML2Text()
converter.ignore_links = False
converter.ignore_images = False

# XML namespaces Ghost's feed uses for the full post body and the author.
namespaces = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'dc': 'http://purl.org/dc/elements/1.1/'
}


class HTMLStripper(HTMLParser):
    """Collects only the text nodes of an HTML fragment."""
    def __init__(self):
        super().__init__()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        # Collapse runs of whitespace (including newlines) so the
        # description stays on one line in the YAML front matter.
        return ' '.join(' '.join(self.fed).split())


def strip_html(html_text):
    stripper = HTMLStripper()
    stripper.feed(html_text or "")
    return stripper.get_data()


def slugify(title):
    # Lowercase, then collapse anything non-alphanumeric into hyphens.
    return re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')


def download_image(url):
    # Download an image once and return its local path; on any failure,
    # fall back to the original remote URL so the post still renders.
    try:
        filename = os.path.basename(urlparse(url).path)
        local_path = os.path.join(image_dir, filename)
        if not os.path.isfile(local_path):
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(local_path, 'wb') as f:
                f.write(response.content)
        return f"/{image_dir}/{filename}"
    except Exception:
        return url


def replace_image_links(html_content):
    # Rewrite every src="..." attribute to point at the local copy.
    def repl(match):
        original_url = match.group(1)
        new_url = download_image(original_url)
        return f'src="{new_url}"'
    return re.sub(r'src="([^"]+)"', repl, html_content)


tree = ET.parse(rss_file)
root = tree.getroot()

for item in root.findall("./channel/item"):
    title = item.findtext("title") or "Untitled"
    pub_date = item.findtext("pubDate")
    author = item.findtext("dc:creator", namespaces=namespaces) or ""
    content = item.findtext("content:encoded", namespaces=namespaces)
    description = item.findtext("description")
    categories = item.findall("category")

    # Prefer the full body; fall back to the summary if it's missing.
    html_content = html.unescape(content or description or "")
    html_content = replace_image_links(html_content)
    markdown_content = converter.handle(html_content).strip()

    # Double quotes would break the quoted YAML string, so swap them.
    clean_description = strip_html(description or "").replace('"', "'")

    # pubDate looks like "Mon, 01 Jan 2024 12:00:00 GMT"; %Z accepts GMT/UTC.
    date_obj = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z")
    iso_date = date_obj.isoformat() + "Z"

    tags = [cat.text.strip() for cat in categories if cat.text]
    slug = slugify(title)
    filename = f"{output_dir}/{slug}.md"

    front_matter = f"""---
title: "{title.replace('"', "'")}"
pubDatetime: {iso_date}
description: "{clean_description}"
author: "{author}"
tags: {tags}
draft: false
---\n"""

    with open(filename, "w", encoding="utf-8") as f:
        f.write(front_matter + "\n" + markdown_content)

print(f"✅ All posts saved to ./{output_dir}/ and images to ./{image_dir}/")
```
✅ Step 3: Copy Into Astro
Once generated, copy the `.md` files into your Astro content collection, e.g.:
```bash
cp posts/*.md src/content/blog/
cp -r images/ public/images/
```
Done! You’ve now migrated your Ghost posts to Astro with full Markdown and clean YAML front matter.