website/convert.py
#!/usr/bin/env python3
# dark wizardry ahead, proceed with caution.
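"""Convert posts from an RSS feed into Hugo markdown files.

Fetches the feed, rewrites embedded image URLs to local copies under
static/images/feed_images/, converts each entry's HTML body to Markdown,
and writes one front-matter-prefixed .md file per entry into
content/posts/. Run it from the root of the Hugo site so the relative
paths below resolve.
"""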
import feedparser
import markdownify as md
from datetime import datetime
import os
import requests
from bs4 import BeautifulSoup
import re
# RSS feed to import (swap in your own feed URL here)
feed_url = 'https://it-syndikat.org/feeds/index.rss2'
hugo_content_dir = './content/posts/' # Default content directory for Hugo posts
images_dir = './static/images/feed_images/' # Directory to store downloaded images
# Ensure the directories exist
os.makedirs(hugo_content_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)
# Function to sanitize filenames
def sanitize_filename(filename):
    # Replace spaces with hyphens, strip illegal characters, and
    # collapse repeated hyphens and dots.
    sanitized = re.sub(r'\ +', '-', filename)  # Replace runs of spaces with a hyphen
    sanitized = re.sub(r'[<>:"\'/\\|?*]', '', sanitized)  # Remove illegal filename characters
    sanitized = re.sub(r'-+', '-', sanitized)  # Collapse multiple hyphens into one
    sanitized = re.sub(r'\.+', '.', sanitized)  # Collapse multiple dots into one
    return sanitized
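# Example: sanitize_filename("my post: a/b.md") -> "my-post-ab.md"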
# Function to sanitize titles
def sanitize_title(title):
    sanitized = re.sub(r'[\']', '', title)  # Remove single quotes
    return sanitized
# Function to download an image and return its local path
def download_image(url):
    try:
        # Derive the local filename from the URL
        filename = os.path.basename(url)
        file_path = os.path.join(images_dir, filename)
        if not os.path.exists(file_path):
            headers = {'User-Agent': 'ItsBot/0.1 (wir@it-syndikat.org)'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded image: {filename}")
        else:
            print(f"Image already exists: {filename}")
        return f'/images/feed_images/{filename}'
    except Exception as e:
        print(f"Failed to download image from {url}: {e}")
        return url
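# Example (hypothetical URL):
#   download_image("https://it-syndikat.org/media/party.jpg")
#   -> "/images/feed_images/party.jpg" (with a local copy on disk)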
# Parse the RSS feed
feed = feedparser.parse(feed_url)
# Loop through each entry in the feed
for entry in feed.entries:
    try:
        title = entry.title.replace("/", "-").replace("\\", "-")  # Clean up titles for filenames
        title = title.replace('"', "'")  # Replace double quotes with single quotes
        sanitized_title = sanitize_title(title)
        date_published = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')
        formatted_date = date_published.strftime('%Y-%m-%d')
        # Extract the author name from an 'email (name)' field
        author_name = entry.author.split('(')[1].split(')')[0].strip() if '(' in entry.author else ''
        # Extract categories/tags
        tags = [category.term for category in entry.tags] if hasattr(entry, 'tags') else []
        # Define the filename and path
        file_name = sanitize_filename(title.lower().replace(' ', '-'))
        file_path = os.path.join(hugo_content_dir, f"{formatted_date}-{file_name}.md")
        if not os.path.exists(file_path):
            # Parse the content with BeautifulSoup to find images
            soup = BeautifulSoup(entry.content[0].value, 'html.parser')
            for img in soup.find_all('img'):
                src_url = img.get('src')
                if src_url:
                    new_src = download_image(src_url)
                    img['src'] = new_src
            # Convert the modified HTML to Markdown
            markdown_content = md.markdownify(str(soup))
            # Create the front matter block for Hugo
            hugo_metadata = f"""---
title: '{sanitized_title}'
date: {formatted_date}
author: "{author_name}"
tags: [{', '.join(f"'{tag}'" for tag in tags)}]
---
"""
            # Combine front matter and content
            full_post_content = hugo_metadata + markdown_content
            # Write the post to a Markdown file in the Hugo content directory
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_post_content)
            print(f"Created new post: {file_path}")
        else:
            print(f"Post already exists: {file_path}")
    except Exception as e:
        print(f"Failed to process entry '{entry.title}': {e}")
print("All posts and images have been imported successfully! Time to start writing more content!")