#!/usr/bin/env python3
# dark wizardry ahead, proceed with caution.
"""Import posts from an RSS feed into a Hugo site.

Fetches the feed, mirrors any inline images into Hugo's static directory,
converts each entry's HTML body to Markdown, and writes one front-mattered
post file per entry. Entries whose target file already exists are skipped,
so the script is safe to re-run.
"""

import os
import re
from datetime import datetime

import feedparser
import markdownify as md
import requests
from bs4 import BeautifulSoup

# Replace 'YOUR_FEED_URL' with your RSS feed URL
feed_url = 'https://it-syndikat.org/feeds/index.rss2'
hugo_content_dir = './content/posts/'        # Default content directory for Hugo posts
images_dir = './static/images/feed_images/'  # Directory to store downloaded images

# Ensure the directories exist before any file is written.
os.makedirs(hugo_content_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)


def sanitize_filename(filename):
    """Return *filename* made safe for use as a file name.

    Spaces become hyphens, characters illegal on common file systems are
    stripped, and runs of hyphens/dots are collapsed to a single one.
    """
    sanitized = re.sub(r'\ +', '-', filename)             # spaces -> hyphens
    sanitized = re.sub(r'[<>:"\'/\\|?*]', '', sanitized)  # drop illegal characters
    sanitized = re.sub(r'-+', '-', sanitized)             # collapse repeated hyphens
    sanitized = re.sub(r'\.+', '.', sanitized)            # collapse repeated dots
    return sanitized


def sanitize_title(title):
    """Return *title* with single quotes removed.

    The title is later embedded in a single-quoted YAML front-matter value,
    where an embedded quote would break parsing.
    """
    return re.sub(r'[\']', '', title)


def download_image(url):
    """Mirror the image at *url* into ``images_dir`` and return its site path.

    Downloads the file only if it is not already present locally, then
    returns the site-relative path Hugo should reference. On any failure
    the original remote URL is returned, so the post still renders with a
    remote image instead of a broken local link.

    NOTE(review): the download body was previously commented out, which made
    the function return a local path for a file that was never written —
    restored here from the commented-out original (with a timeout added).
    """
    try:
        # Extract filename from URL; assumes the URL path ends in a usable
        # file name — TODO confirm the feed never serves query-only image URLs.
        filename = os.path.basename(url)
        file_path = os.path.join(images_dir, filename)
        if not os.path.exists(file_path):
            headers = {'User-Agent': 'ItsBot/0.1 (wir@it-syndikat.org)'}
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded image: {file_path}")
        else:
            print(f"Image already exists: {file_path}")
        return f'/images/feed_images/{filename}'
    except Exception as e:
        # Best-effort: keep the remote URL rather than abort the post.
        print(f"Failed to download image from {url}: {e}")
        return url


# Parse the RSS feed
feed = feedparser.parse(feed_url)

# Loop through each entry in the feed
for entry in feed.entries:
    try:
        # Clean up titles for filenames: slashes would create path
        # components; double quotes would clash with the YAML front matter.
        title = entry.title.replace("/", "-").replace("\\", "-")
        title = title.replace('"', "'")
        sanitized_title = sanitize_title(title)

        # RFC 822 date as used by RSS 2.0, e.g. 'Mon, 01 Jan 2024 12:00:00 +0000'.
        date_published = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')
        formatted_date = date_published.strftime('%Y-%m-%d')

        # Extract author name from 'email (name)'; empty if no name part.
        author_name = entry.author.split('(')[1].split(')')[0].strip() if '(' in entry.author else ''

        # Extract categories/tags (feeds without categories yield no 'tags' attr).
        tags = [category.term for category in entry.tags] if hasattr(entry, 'tags') else []

        # Define the filename and path, e.g. '2024-01-01-my-post.md'.
        file_name = sanitize_filename(title.lower().replace(' ', '-'))
        file_path = os.path.join(hugo_content_dir, f"{formatted_date}-{file_name}.md")

        if not os.path.exists(file_path):
            # Parse the content with BeautifulSoup and rewrite <img> sources
            # to point at locally mirrored copies.
            soup = BeautifulSoup(entry.content[0].value, 'html.parser')
            for img in soup.find_all('img'):
                src_url = img.get('src')
                if src_url:
                    img['src'] = download_image(src_url)

            # Convert the modified HTML to Markdown.
            markdown_content = md.markdownify(str(soup))

            # Create metadata block (front matter) for Hugo.
            hugo_metadata = f"""---
title: '{sanitized_title}'
date: {formatted_date}
author: "{author_name}"
tags: [{', '.join(f"'{tag}'" for tag in tags)}]
---
"""

            # Combine metadata and content, then write the post file.
            full_post_content = hugo_metadata + markdown_content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_post_content)
            print(f"Created new post: {file_path}")
        else:
            print(f"Post already exists: {file_path}")
    except Exception as e:
        # One malformed entry must not abort the whole import run.
        print(f"Failed to process entry '{entry.title}': {e}")
        continue

print("All posts and images have been imported successfully! Time to start writing more content!")