157 lines
4.8 KiB
Python
157 lines
4.8 KiB
Python
#!/usr/bin/env python3
"""
Script to download all images from the MTC website and store them locally.

This makes the application work completely offline.
"""
|
|
|
|
import os
|
|
import re
|
|
import requests
|
|
from urllib.parse import urlparse
|
|
import time
|
|
from pathlib import Path
|
|
|
|
def create_images_directory():
    """Ensure the ``static/images`` folder exists and return its path.

    Missing parent directories are created as well; a directory that is
    already present is left untouched.
    """
    target = Path("static") / "images"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
|
|
def extract_image_urls_from_markdown(markdown_file="data/balotario_clase_a_cat_I.md"):
    """Collect the unique MTC image URLs referenced by a markdown file.

    Args:
        markdown_file: Path of the markdown document to scan. Defaults to
            ``data/balotario_clase_a_cat_I.md``.

    Returns:
        The distinct image URLs in order of first appearance, or an empty
        list (after printing an error) when the file does not exist.
    """
    markdown_file = Path(markdown_file)

    if not markdown_file.exists():
        print(f"Error: {markdown_file} not found!")
        return []

    content = markdown_file.read_text(encoding="utf-8")

    # Matches only image links with an empty alt text that point at
    # img<digits>.jpg files under the MTC img-data folder.
    pattern = r'!\[\]\((https://sierdgtt\.mtc\.gob\.pe/Content/img-data/img\d+\.jpg)\)'
    urls = re.findall(pattern, content)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is reproducible between runs (a set would shuffle the URLs).
    return list(dict.fromkeys(urls))
|
|
|
|
def download_image(url, images_dir, retries=3):
    """Download a single image into ``images_dir`` with retry logic.

    Args:
        url: Address of the image; the last path segment of the URL becomes
            the local file name.
        images_dir: Directory the image is stored in.
        retries: Maximum number of download attempts.

    Returns:
        True if the file already exists or was downloaded, False otherwise.
        Failures are reported with ``print`` instead of being raised.
    """
    try:
        # Extract filename from URL
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            # Without this guard the target path would be images_dir itself,
            # which exists, so the URL would be reported as already present.
            print(f"💥 Error downloading {url}: URL does not contain a file name")
            return False

        filepath = Path(images_dir) / filename

        # Skip if already exists
        if filepath.exists():
            print(f"✓ {filename} already exists")
            return True

        print(f"📥 Downloading {filename}...")

        # Loop-invariant, so built once instead of on every attempt.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Data is written here first and renamed afterwards, so an interrupted
        # write never leaves a truncated file under the final name (which the
        # "already exists" check above would accept on the next run).
        partial_path = filepath.with_name(filename + ".part")

        # Download with retries
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"❌ Attempt {attempt + 1} failed for {filename}: {e}")
                if attempt < retries - 1:
                    time.sleep(2)  # Wait before retry
                continue

            # Save the image
            try:
                with open(partial_path, 'wb') as f:
                    f.write(response.content)
                os.replace(partial_path, filepath)
            finally:
                # No-op after a successful rename; otherwise removes the
                # incomplete temporary file.
                if partial_path.exists():
                    partial_path.unlink()

            print(f"✅ Downloaded {filename} ({len(response.content)} bytes)")
            return True

        print(f"💥 Failed to download {filename} after {retries} attempts")
        return False

    except Exception as e:
        # Best-effort boundary: anything unexpected (e.g. a disk error while
        # saving) is reported and counted as a failed download.
        print(f"💥 Error downloading {url}: {e}")
        return False
|
|
|
|
def update_markdown_file(images_dir, markdown_file="data/balotario_clase_a_cat_I.md"):
    """Rewrite remote MTC image links in the markdown file to local paths.

    A one-time backup ``<markdown_file>.backup`` is written before the first
    modification and is never overwritten afterwards. Links whose image is
    not present in ``images_dir`` keep their remote URL so that a failed
    download does not turn into a dead local link.

    Args:
        images_dir: Directory holding the downloaded images; it also
            determines the path written into the rewritten links.
        markdown_file: Markdown document to update in place. Defaults to
            ``data/balotario_clase_a_cat_I.md``.

    Raises:
        FileNotFoundError: If ``markdown_file`` does not exist.
    """
    images_dir = Path(images_dir)
    markdown_file = Path(markdown_file)
    backup_file = markdown_file.with_name(markdown_file.name + ".backup")

    # Read current content
    content = markdown_file.read_text(encoding="utf-8")

    # Create backup
    if not backup_file.exists():
        backup_file.write_text(content, encoding="utf-8")
        print(f"📋 Created backup: {backup_file}")

    # NOTE(review): links are written root-relative (e.g. /static/images/x.jpg),
    # which assumes the app serves images_dir from the site root — confirm.
    url_prefix = "/" + images_dir.as_posix().strip("/")

    def to_local_link(match):
        """Return the local link for a match, or the original text if the image is missing."""
        filename = match.group(1)
        if not (images_dir / filename).is_file():
            return match.group(0)
        return f"![]({url_prefix}/{filename})"

    # Replace URLs with local paths
    pattern = r'!\[\]\(https://sierdgtt\.mtc\.gob\.pe/Content/img-data/(img\d+\.jpg)\)'
    updated_content = re.sub(pattern, to_local_link, content)

    # Write updated content
    markdown_file.write_text(updated_content, encoding="utf-8")

    print("📝 Updated markdown file to use local image paths")
|
|
|
|
def main():
    """Run the whole workflow: prepare the folder, fetch images, patch the markdown.

    Progress and a final summary are printed. The markdown file is only
    rewritten when at least one image was reported as available.
    """
    print("🚀 Starting image download process...")

    # Destination folder
    images_dir = create_images_directory()
    print(f"📁 Images will be saved to: {images_dir}")

    # Image URLs referenced by the markdown file
    urls = extract_image_urls_from_markdown()
    total = len(urls)
    print(f"🔍 Found {total} unique images to download")

    if total == 0:
        print("❌ No image URLs found!")
        return

    # Fetch every image, remembering the outcome of each attempt
    outcomes = []
    for position, url in enumerate(urls, start=1):
        print(f"\n[{position}/{total}] Processing: {url}")
        outcomes.append(download_image(url, images_dir))

        # Short pause between requests to go easy on the server
        time.sleep(0.5)

    successful = sum(1 for ok in outcomes if ok)
    failed = len(outcomes) - successful

    # Summary
    jpg_count = sum(1 for _ in images_dir.glob('*.jpg'))
    print("\n📊 Download Summary:")
    print(f"✅ Successful: {successful}")
    print(f"❌ Failed: {failed}")
    print(f"📁 Total files: {jpg_count}")

    # Nothing usable on disk: leave the markdown file alone
    if successful <= 0:
        print("❌ No images were downloaded successfully")
        return

    print("\n🔄 Updating markdown file...")
    update_markdown_file(images_dir)
    print("✅ Process completed!")
    print("💡 You can now run your app completely offline!")
|
|
|
|
# Run the download workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|