forked from IT-Syndikat/website
120 lines
4.3 KiB
Python
Executable file
120 lines
4.3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
# dark wizardry ahead, proceed with caution.
|
|
|
|
import os
import re
from datetime import datetime
from urllib.parse import urlsplit

import feedparser
import markdownify as md
import requests
from bs4 import BeautifulSoup
|
|
|
|
# RSS feed that is imported, and the Hugo directories the results are written to
feed_url = 'https://it-syndikat.org/feeds/index.rss2'
hugo_content_dir = './content/posts/'  # Default content directory for Hugo posts
images_dir = './static/images/feed_images/'  # Directory to store downloaded images

# Create both output directories up front; existing ones are left untouched
for _output_dir in (hugo_content_dir, images_dir):
    os.makedirs(_output_dir, exist_ok=True)
|
|
|
|
# Function to sanitize filenames
def sanitize_filename(filename):
    """Return *filename* cleaned up for use as a file name.

    The substitutions run in this order: every run of spaces becomes one
    hyphen, the characters < > : " ' / backslash | ? * are dropped, and
    runs of hyphens or of dots are collapsed to a single one.
    """
    substitutions = (
        (r'\ +', '-'),               # spaces -> hyphen
        (r'[<>:"\'/\\|?*]', ''),     # drop illegal characters
        (r'-+', '-'),                # collapse repeated hyphens
        (r'\.+', '.'),               # collapse repeated dots
    )
    result = filename
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result
|
|
|
|
# Function to sanitize titles
def sanitize_title(title):
    """Return *title* with every single-quote character removed."""
    return title.replace("'", "")
|
|
|
|
# Function to map an image URL to its local path (the download itself is disabled)
def download_image(url):
    """Return the site-local path that the image at *url* is served from.

    NOTE: the HTTP download is currently switched off; the original
    implementation is kept as commented-out code at the end of the function.
    Only the URL-to-path rewrite is performed.

    Args:
        url: Value of an ``<img>`` ``src`` attribute taken from the feed.

    Returns:
        ``/images/feed_images/<filename>``, where ``<filename>`` is the last
        path segment of *url* with any query string and fragment removed.
        *url* is returned unchanged when it is not a string, cannot be
        parsed, uses a scheme other than http/https (e.g. ``data:`` URIs),
        or has no file name (path ends in ``/``).
    """
    if not isinstance(url, str):
        return url

    try:
        parts = urlsplit(url)
    except ValueError as e:
        # urlsplit rejects malformed URLs (e.g. unbalanced IPv6 brackets)
        print(f"Failed to download image from {url}: {e}")
        return url

    # Scheme-less (relative) and http(s) URLs point at real files; anything
    # else, such as an inline data: URI, has no file name to map.
    if parts.scheme not in ('', 'http', 'https'):
        return url

    # Use only the path component so "?query" and "#fragment" never end up
    # in the file name.
    filename = parts.path.rsplit('/', 1)[-1]
    if not filename:
        return url

    # Download disabled -- original implementation kept for reference:
    # headers = {'User-Agent': 'ItsBot/0.1 (wir@it-syndikat.org)'}
    # response = requests.get(url, headers=headers)
    # response.raise_for_status()
    # file_path = os.path.join(images_dir, filename)
    # if not os.path.exists(file_path):
    #     with open(file_path, 'wb') as f:
    #         f.write(response.content)
    #     print(f"Downloaded image: {filename}")
    # else:
    #     print(f"Image already exists: {filename}")

    return f'/images/feed_images/{filename}'
|
|
|
|
# Parse the RSS feed
feed = feedparser.parse(feed_url)

# feedparser does not raise when the feed cannot be fetched or parsed; it sets
# `bozo` and stores the cause in `bozo_exception`. Report that when no entries
# came back, so an unreachable feed is not mistaken for a clean, empty import.
if feed.get('bozo') and not feed.entries:
    print(f"Failed to read feed {feed_url}: {feed.get('bozo_exception')}")

failed_entries = 0  # Number of entries skipped because of an error

# Loop through each entry in the feed; each entry becomes one Markdown post
for entry in feed.entries:
    # Looked up with .get() so the error report in the handler below cannot
    # itself raise when the entry has no title.
    entry_title = entry.get('title', '')

    try:
        title = entry.title.replace("/", "-").replace("\\", "-")  # Clean up titles for filenames
        title = title.replace('"', "'")  # Replace double quotes with single quotes

        sanitized_title = sanitize_title(title)

        date_published = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %z')
        formatted_date = date_published.strftime('%Y-%m-%d')

        # Extract author name from 'email (name)'; a missing author yields ''
        author = entry.get('author', '')
        author_name = author.split('(')[1].split(')')[0].strip() if '(' in author else ''

        # Extract categories/tags, ignoring categories that carry no term
        tags = [
            category.get('term')
            for category in entry.get('tags', [])
            if category.get('term')
        ]

        # Define the filename and path
        file_name = sanitize_filename(title.lower().replace(' ', '-'))
        file_path = os.path.join(hugo_content_dir, f"{formatted_date}-{file_name}.md")

        # Posts that were imported on an earlier run are never overwritten
        if os.path.exists(file_path):
            print(f"Post already exists: {file_path}")
            continue

        # Parse the content with BeautifulSoup and point every image at its
        # site-local path
        soup = BeautifulSoup(entry.content[0].value, 'html.parser')
        for img in soup.find_all('img'):
            src_url = img.get('src')
            if src_url:
                img['src'] = download_image(src_url)

        # Convert the modified HTML to Markdown
        markdown_content = md.markdownify(str(soup))

        # Escape values for the quoting style they are embedded in, so quotes
        # or backslashes in feed data cannot break the front matter:
        # YAML double-quoted scalars escape \ and " with a backslash,
        # YAML single-quoted scalars escape ' by doubling it.
        author_yaml = author_name.replace('\\', '\\\\').replace('"', '\\"')
        tags_yaml = ', '.join("'" + tag.replace("'", "''") + "'" for tag in tags)

        # Create metadata block for Hugo (front matter followed by a blank line)
        hugo_metadata = (
            "---\n"
            f"title: '{sanitized_title}'\n"
            f"date: {formatted_date}\n"
            f'author: "{author_yaml}"\n'
            f"tags: [{tags_yaml}]\n"
            "---\n"
            "\n"
        )

        # Combine metadata and content
        full_post_content = hugo_metadata + markdown_content

        # Write the post to a Markdown file in the Hugo content directory
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(full_post_content)
        print(f"Created new post: {file_path}")

    except Exception as e:
        # Per-entry boundary: one broken entry must not stop the whole import
        failed_entries += 1
        print(f"Failed to process entry '{entry_title}': {e}")

# Only claim success when every entry was handled without an error
if failed_entries:
    print(f"Import finished, but {failed_entries} entries failed; see the messages above.")
else:
    print("All posts and images have been imported successfully! Time to start writing more content!")
|