initial commit
scripts/download_images.py (new file, 156 lines)
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Script to download all images from the MTC website and store them locally.
This makes the application work completely offline.
"""

import os
import re
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

def create_images_directory():
    """Create the static/images directory if it doesn't exist."""
    images_dir = Path("static/images")
    images_dir.mkdir(parents=True, exist_ok=True)
    return images_dir

def extract_image_urls_from_markdown():
    """Extract all image URLs from the markdown file."""
    markdown_file = Path("data/balotario_clase_a_cat_I.md")

    if not markdown_file.exists():
        print(f"Error: {markdown_file} not found!")
        return []

    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find all image URLs
    pattern = r'!\[\]\((https://sierdgtt\.mtc\.gob\.pe/Content/img-data/img\d+\.jpg)\)'
    urls = re.findall(pattern, content)

    return list(set(urls))  # Remove duplicates

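# Illustrative match for the pattern above (URL shape only, not a real
# question-bank entry):
#   "![](https://sierdgtt.mtc.gob.pe/Content/img-data/img123.jpg)"
#   -> ["https://sierdgtt.mtc.gob.pe/Content/img-data/img123.jpg"]
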
def download_image(url, images_dir, retries=3):
    """Download a single image with retry logic."""
    try:
        # Extract filename from URL
        filename = os.path.basename(urlparse(url).path)
        filepath = images_dir / filename

        # Skip if already exists
        if filepath.exists():
            print(f"✓ {filename} already exists")
            return True

        print(f"📥 Downloading {filename}...")

        # Download with retries
        for attempt in range(retries):
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }

                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()

                # Save the image
                with open(filepath, 'wb') as f:
                    f.write(response.content)

                print(f"✅ Downloaded {filename} ({len(response.content)} bytes)")
                return True

            except requests.RequestException as e:
                print(f"❌ Attempt {attempt + 1} failed for {filename}: {e}")
                if attempt < retries - 1:
                    time.sleep(2)  # Wait before retry

        print(f"💥 Failed to download {filename} after {retries} attempts")
        return False

    except Exception as e:
        print(f"💥 Error downloading {url}: {e}")
        return False

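# Note on the retry loop above: each failed attempt waits 2 seconds before
# the next try, so with the default retries=3 a dead URL is abandoned after
# three requests (each capped at the 30-second timeout).
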
def update_markdown_file(images_dir):
    """Update the markdown file to use local image paths."""
    markdown_file = Path("data/balotario_clase_a_cat_I.md")
    backup_file = Path("data/balotario_clase_a_cat_I.md.backup")

    # Create backup
    if not backup_file.exists():
        with open(markdown_file, 'r', encoding='utf-8') as f:
            content = f.read()
        with open(backup_file, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"📋 Created backup: {backup_file}")

    # Read current content
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Replace remote URLs with local paths (assumes the app serves the
    # downloaded files from the static/ directory, i.e. /static/images/...)
    pattern = r'!\[\]\(https://sierdgtt\.mtc\.gob\.pe/Content/img-data/(img\d+\.jpg)\)'
    replacement = rf'![](/{images_dir.as_posix()}/\1)'

    updated_content = re.sub(pattern, replacement, content)

    # Write updated content
    with open(markdown_file, 'w', encoding='utf-8') as f:
        f.write(updated_content)

    print("📝 Updated markdown file to use local image paths")

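# Example of the rewrite performed above (the /static/images/ prefix is an
# assumption about how the app serves the downloaded files):
#   ![](https://sierdgtt.mtc.gob.pe/Content/img-data/img42.jpg)
#   -> ![](/static/images/img42.jpg)
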
def main():
    """Main function to download all images."""
    print("🚀 Starting image download process...")

    # Create images directory
    images_dir = create_images_directory()
    print(f"📁 Images will be saved to: {images_dir}")

    # Extract image URLs
    urls = extract_image_urls_from_markdown()
    print(f"🔍 Found {len(urls)} unique images to download")

    if not urls:
        print("❌ No image URLs found!")
        return

    # Download images
    successful = 0
    failed = 0

    for i, url in enumerate(urls, 1):
        print(f"\n[{i}/{len(urls)}] Processing: {url}")

        if download_image(url, images_dir):
            successful += 1
        else:
            failed += 1

        # Small delay to be respectful to the server
        time.sleep(0.5)

    # Summary
    print("\n📊 Download Summary:")
    print(f"✅ Successful: {successful}")
    print(f"❌ Failed: {failed}")
    print(f"📁 Total files: {len(list(images_dir.glob('*.jpg')))}")

    # Update markdown file
    if successful > 0:
        print("\n🔄 Updating markdown file...")
        update_markdown_file(images_dir)
        print("✅ Process completed!")
        print("💡 You can now run your app completely offline!")
    else:
        print("❌ No images were downloaded successfully")

if __name__ == "__main__":
    main()
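For reference, a minimal sanity check after running `python scripts/download_images.py` from the repository root (a sketch; it assumes the same static/images layout the script creates):

#!/usr/bin/env python3
"""Quick check that the downloader produced local images (a sketch,
assuming the static/images layout used by scripts/download_images.py)."""
from pathlib import Path

images = sorted(Path("static/images").glob("img*.jpg"))
print(f"{len(images)} local images on disk")
for img in images[:5]:
    print(f"  {img.name}: {img.stat().st_size} bytes")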