website/convert.py
#!/usr/bin/env python3
# dark wizardry ahead, proceed with caution.
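"""Convert posts from an RSS feed into Hugo markdown files.

Fetches the feed, rewrites embedded image URLs to local copies under
static/images/feed_images/, converts each entry's HTML body to Markdown,
and writes one front-matter-prefixed .md file per entry into
content/posts/. Run it from the root of the Hugo site so the relative
paths below resolve.
"""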
import feedparser
import markdownify as md
from datetime import datetime
import os
import requests
from bs4 import BeautifulSoup
import re
# RSS feed to import (swap in your own feed URL here)
feed_url = 'https://it-syndikat.org/feeds/index.rss2'
hugo_content_dir = './content/posts/' # Default content directory for Hugo posts
images_dir = './static/images/feed_images/' # Directory to store downloaded images
# Ensure the directories exist
os.makedirs(hugo_content_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)
# Function to sanitize filenames
def sanitize_filename(filename):
    # Replace spaces with hyphens, strip illegal characters, and
    # collapse repeated hyphens and dots.
    sanitized = re.sub(r'\ +', '-', filename)  # Replace runs of spaces with a hyphen
    sanitized = re.sub(r'[<>:"\'/\\|?*]', '', sanitized)  # Remove illegal filename characters
    sanitized = re.sub(r'-+', '-', sanitized)  # Collapse multiple hyphens into one
    sanitized = re.sub(r'\.+', '.', sanitized)  # Collapse multiple dots into one
    return sanitized
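# Example: sanitize_filename("my post: a/b.md") -> "my-post-ab.md"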
# Function to sanitize titles
def sanitize_title(title):
    sanitized = re.sub(r'[\']', '', title)  # Remove single quotes
    return sanitized
# Function to download an image and return its local path
def download_image(url):
    try:
        # Derive the local filename from the URL
        filename = os.path.basename(url)
        file_path = os.path.join(images_dir, filename)
        if not os.path.exists(file_path):
            headers = {'User-Agent': 'ItsBot/0.1 (wir@it-syndikat.org)'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded image: {filename}")
        else:
            print(f"Image already exists: {filename}")
        return f'/images/feed_images/{filename}'
    except Exception as e:
        print(f"Failed to download image from {url}: {e}")
        return url
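# Example (hypothetical URL):
#   download_image("https://it-syndikat.org/media/party.jpg")
#   -> "/images/feed_images/party.jpg" (with a local copy on disk)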
# Parse the RSS feed
feed = feedparser.parse(feed_url)
# Loop through each entry in the feed
for entry in feed.entries:
    try:
        title = entry.title.replace("/", "-").replace("\\", "-")  # Clean up titles for filenames
        title = title.replace('"', "'")  # Replace double quotes with single quotes
        sanitized_title = sanitize_title(title)
        date_published = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')
        formatted_date = date_published.strftime('%Y-%m-%d')
        # Extract the author name from an 'email (name)' field
        author_name = entry.author.split('(')[1].split(')')[0].strip() if '(' in entry.author else ''
        # Extract categories/tags
        tags = [category.term for category in entry.tags] if hasattr(entry, 'tags') else []
        # Define the filename and path
        file_name = sanitize_filename(title.lower().replace(' ', '-'))
        file_path = os.path.join(hugo_content_dir, f"{formatted_date}-{file_name}.md")
        if not os.path.exists(file_path):
            # Parse the content with BeautifulSoup to find images
            soup = BeautifulSoup(entry.content[0].value, 'html.parser')
            for img in soup.find_all('img'):
                src_url = img.get('src')
                if src_url:
                    new_src = download_image(src_url)
                    img['src'] = new_src
            # Convert the modified HTML to Markdown
            markdown_content = md.markdownify(str(soup))
            # Create the front matter block for Hugo
            hugo_metadata = f"""---
title: '{sanitized_title}'
date: {formatted_date}
author: "{author_name}"
tags: [{', '.join(f"'{tag}'" for tag in tags)}]
---
"""
            # Combine front matter and content
            full_post_content = hugo_metadata + markdown_content
            # Write the post to a Markdown file in the Hugo content directory
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_post_content)
            print(f"Created new post: {file_path}")
        else:
            print(f"Post already exists: {file_path}")
    except Exception as e:
        print(f"Failed to process entry '{entry.title}': {e}")
print("All posts and images have been imported successfully! Time to start writing more content!")