#!/usr/bin/env python3
"""
Script to download all images from the MTC website and store them locally.
This makes the application work completely offline.
"""

import os
import re
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

# Markdown source that references the remote images, and its one-time backup.
MARKDOWN_FILE = Path("data/balotario_clase_a_cat_I.md")
MARKDOWN_BACKUP_FILE = Path("data/balotario_clase_a_cat_I.md.backup")

# Matches a remote image link and captures the full URL.
REMOTE_IMAGE_URL_PATTERN = r'!\[\]\((https://sierdgtt\.mtc\.gob\.pe/Content/img-data/img\d+\.jpg)\)'
# Matches the same links but captures only the file name (img<N>.jpg).
REMOTE_IMAGE_FILENAME_PATTERN = r'!\[\]\(https://sierdgtt\.mtc\.gob\.pe/Content/img-data/(img\d+\.jpg)\)'
# Replacement template pointing at the locally served copy of the image.
LOCAL_IMAGE_REPLACEMENT = r'![](/static/images/\1)'

REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
REQUEST_TIMEOUT_SECONDS = 30
RETRY_DELAY_SECONDS = 2
DELAY_BETWEEN_IMAGES_SECONDS = 0.5


def _is_usable_image(path):
    """Return True if *path* is an existing, non-empty regular file."""
    return path.is_file() and path.stat().st_size > 0


def create_images_directory():
    """Create the static/images directory if it doesn't exist.

    Returns:
        Path to the images directory.
    """
    images_dir = Path("static/images")
    images_dir.mkdir(parents=True, exist_ok=True)
    return images_dir


def extract_image_urls_from_markdown():
    """Extract all remote image URLs from the markdown file.

    Returns:
        A list of unique image URLs in order of first appearance, or an
        empty list if the markdown file does not exist.
    """
    if not MARKDOWN_FILE.exists():
        print(f"Error: {MARKDOWN_FILE} not found!")
        return []

    with open(MARKDOWN_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    urls = re.findall(REMOTE_IMAGE_URL_PATTERN, content)

    # dict.fromkeys removes duplicates while keeping document order, so the
    # download sequence is the same on every run (a plain set is unordered).
    return list(dict.fromkeys(urls))


def download_image(url, images_dir, retries=3):
    """Download a single image with retry logic.

    The image is first written to a temporary ``<name>.part`` file and only
    moved to its final name once the write has completed, so an interrupted
    run never leaves a truncated image that later runs would skip.

    Args:
        url: Remote URL of the image.
        images_dir: Directory (Path) in which the image is stored.
        retries: Maximum number of download attempts.

    Returns:
        True if the image is available locally (already present or freshly
        downloaded), False otherwise. Errors are reported on stdout and
        never raised to the caller.
    """
    partial_path = None
    try:
        # Extract filename from URL
        filename = os.path.basename(urlparse(url).path)
        filepath = images_dir / filename

        # Skip if a non-empty copy already exists
        if _is_usable_image(filepath):
            print(f"āœ“ {filename} already exists")
            return True

        print(f"šŸ“„ Downloading {filename}...")
        partial_path = filepath.with_name(filename + ".part")

        for attempt in range(retries):
            try:
                response = requests.get(
                    url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS
                )
                response.raise_for_status()

                # An empty body is not a valid image; route it through the
                # normal retry path instead of saving a zero-byte file.
                if not response.content:
                    raise requests.RequestException("empty response body")

                with open(partial_path, 'wb') as f:
                    f.write(response.content)
                # Move the finished file into place in a single step.
                os.replace(partial_path, filepath)

                print(f"āœ… Downloaded {filename} ({len(response.content)} bytes)")
                return True

            except requests.RequestException as e:
                print(f"āŒ Attempt {attempt + 1} failed for {filename}: {e}")
                if attempt < retries - 1:
                    time.sleep(RETRY_DELAY_SECONDS)  # Wait before retry

        print(f"šŸ’„ Failed to download {filename} after {retries} attempts")
        return False

    except Exception as e:
        # Deliberate best-effort: one bad image must not abort the whole run.
        print(f"šŸ’„ Error downloading {url}: {e}")
        return False

    finally:
        # Remove a leftover temporary file from a failed write.
        if partial_path is not None and partial_path.exists():
            try:
                partial_path.unlink()
            except OSError:
                pass


def update_markdown_file(images_dir):
    """Update the markdown file to use local image paths.

    Only links whose image exists (non-empty) in *images_dir* are rewritten;
    links to images that are not available locally keep their remote URL so
    they are not turned into broken local references.

    A backup of the markdown file is created the first time this runs.

    Args:
        images_dir: Directory containing the downloaded images.
    """
    images_dir = Path(images_dir)

    # Read current content
    with open(MARKDOWN_FILE, 'r', encoding='utf-8') as f:
        content = f.read()

    # Create backup (only once, so the pristine original is never overwritten)
    if not MARKDOWN_BACKUP_FILE.exists():
        with open(MARKDOWN_BACKUP_FILE, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"šŸ“‹ Created backup: {MARKDOWN_BACKUP_FILE}")

    missing = set()

    def localize(match):
        """Return the local link if the image exists, else the original link."""
        filename = match.group(1)
        if _is_usable_image(images_dir / filename):
            return match.expand(LOCAL_IMAGE_REPLACEMENT)
        missing.add(filename)
        return match.group(0)

    # Replace URLs with local paths
    updated_content = re.sub(REMOTE_IMAGE_FILENAME_PATTERN, localize, content)

    # Write updated content
    with open(MARKDOWN_FILE, 'w', encoding='utf-8') as f:
        f.write(updated_content)

    print("šŸ“ Updated markdown file to use local image paths")
    if missing:
        print(
            f"Warning: {len(missing)} image(s) are not available locally; "
            "their remote URLs were left unchanged"
        )


def main():
    """Main function to download all images."""
    print("šŸš€ Starting image download process...")

    # Create images directory
    images_dir = create_images_directory()
    print(f"šŸ“ Images will be saved to: {images_dir}")

    # Extract image URLs
    urls = extract_image_urls_from_markdown()
    print(f"šŸ” Found {len(urls)} unique images to download")

    if not urls:
        print("āŒ No image URLs found!")
        return

    # Download images
    successful = 0
    failed = 0

    for i, url in enumerate(urls, 1):
        print(f"\n[{i}/{len(urls)}] Processing: {url}")

        if download_image(url, images_dir):
            successful += 1
        else:
            failed += 1

        # Small delay to be respectful to the server
        time.sleep(DELAY_BETWEEN_IMAGES_SECONDS)

    # Summary
    print("\nšŸ“Š Download Summary:")
    print(f"āœ… Successful: {successful}")
    print(f"āŒ Failed: {failed}")
    print(f"šŸ“ Total files: {len(list(images_dir.glob('*.jpg')))}")

    # Update markdown file
    if successful > 0:
        print("\nšŸ”„ Updating markdown file...")
        update_markdown_file(images_dir)
        print("āœ… Process completed!")
        if failed == 0:
            print("šŸ’” You can now run your app completely offline!")
        else:
            # Some links still point at the remote server, so do not claim
            # that the app is fully offline-capable yet.
            print(
                f"Warning: {failed} image(s) could not be downloaded; "
                "re-run the script to retry them"
            )
    else:
        print("āŒ No images were downloaded successfully")


if __name__ == "__main__":
    main()