Generated by Claude Sonnet 4
PYTHON
#!/usr/bin/env python3
"""
Wikimedia图片下载脚本
从test.py文件中提取所有Wikimedia链接并下载图片到本地
使用HTTPS代理:127.0.0.1:7890
"""
import os
import re
import requests
import urllib.parse
from pathlib import Path
import time
def extract_urls_from_file(filename):
    """Return the distinct Wikimedia upload URLs found in a text file.

    The file is read as UTF-8 and scanned for links that start with
    ``https://upload.wikimedia.org/``. A match runs up to the next
    whitespace character, so when a URL is embedded in source code
    (``"https://...jpg",``) the closing quote and separator would be
    captured as part of it; those trailing characters are stripped.

    Args:
        filename: Path of the file to scan.

    Returns:
        A list of URLs in order of first appearance, without duplicates.
        An empty list is returned (after printing a message) when the file
        is missing or cannot be read/decoded.
    """
    # Keep the try body limited to the operations that can actually fail.
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"错误:找不到文件 {filename}")
        return []
    except (OSError, ValueError) as e:
        # OSError: permission/IO problems; ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"读取文件时出错:{e}")
        return []

    # \S+ is equivalent to the former [^\s\n]+ (\s already covers \n).
    url_pattern = r'https://upload\.wikimedia\.org/\S+'
    matches = re.findall(url_pattern, content)

    # Drop quote/separator characters that trail a URL inside a string
    # literal or list, then de-duplicate while preserving order.
    cleaned = (match.rstrip('"\',;') for match in matches)
    return list(dict.fromkeys(cleaned))
def get_filename_from_url(url):
    """Derive a filesystem-safe local filename from an image URL.

    Only the path component of the URL is considered, so a query string or
    fragment never ends up in the filename. The last path segment is
    isolated *before* percent-decoding, so an encoded slash (``%2F``)
    inside the name cannot truncate it.

    Args:
        url: The image URL (may contain percent-encoded characters).

    Returns:
        The decoded last path segment with the characters
        ``< > : " / \\ | ? *`` each replaced by ``_``. The result is an
        empty string when the URL path ends with ``/``.
    """
    # Ignore ?query and #fragment; they are not part of the file name.
    path = urllib.parse.urlsplit(url).path
    # URL paths always use "/", independent of the local OS separator.
    last_segment = path.rsplit('/', 1)[-1]
    decoded_name = urllib.parse.unquote(last_segment)
    # Replace characters that are invalid in file names on common systems.
    return re.sub(r'[<>:"/\\|?*]', '_', decoded_name)
def _remove_partial_file(path):
    """Best-effort removal of an incomplete download; errors are ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def download_image(url, output_dir, proxies, session, max_retries=3):
    """Download one image into ``output_dir``, retrying on request errors.

    The local name comes from ``get_filename_from_url(url)``. If a file with
    that name already exists the download is skipped. Data is streamed into
    a ``.part`` file and only moved to its final name once the transfer has
    completed, so an interrupted download is never mistaken for a finished
    one on a later run.

    Args:
        url: Image URL to fetch.
        output_dir: Existing directory that receives the file.
        proxies: Proxy mapping passed to ``session.get``.
        session: Object providing a ``requests.Session``-style ``get``.
        max_retries: Maximum number of attempts; a 3 second pause is made
            between attempts.

    Returns:
        True if the file already existed or was downloaded successfully,
        False otherwise.
    """
    filename = get_filename_from_url(url)
    if not filename:
        # With an empty name, filepath would be output_dir itself, which
        # exists and would be reported as an already-downloaded file.
        print(f"✗ 意外错误: {url} - 无法从URL中提取文件名")
        return False

    filepath = os.path.join(output_dir, filename)
    # Skip files that were downloaded by a previous run.
    if os.path.exists(filepath):
        print(f"跳过已存在的文件: {filename}")
        return True

    partial_path = filepath + ".part"
    for attempt in range(max_retries):
        try:
            print(f"正在下载: {filename} (尝试 {attempt + 1}/{max_retries})")
            # The context manager releases the streamed connection even if
            # the transfer fails part-way through.
            with session.get(url, proxies=proxies, timeout=30, stream=True) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Publish the file under its final name only when complete.
            os.replace(partial_path, filepath)
            print(f"✓ 下载成功: {filename}")
            return True
        except requests.exceptions.RequestException as e:
            _remove_partial_file(partial_path)
            print(f"✗ 下载失败 (尝试 {attempt + 1}/{max_retries}): {filename}")
            print(f" 错误: {e}")
            if attempt < max_retries - 1:
                print(f" 等待3秒后重试...")
                time.sleep(3)
            else:
                print(f" 已达到最大重试次数,跳过此文件")
                return False
        except Exception as e:
            # Deliberately broad: one bad file (e.g. a disk/write error)
            # must not abort the whole batch. Not retried.
            _remove_partial_file(partial_path)
            print(f"✗ 意外错误: {filename} - {e}")
            return False

    # Reached only when max_retries <= 0 (no attempt was made).
    return False
def main(input_file="test.py", output_dir="downloaded_images",
         proxy_url="127.0.0.1:7890"):
    """Download every Wikimedia image linked in ``input_file``.

    URLs are collected with ``extract_urls_from_file`` and fetched one by
    one with ``download_image`` through the given proxy, with a one second
    pause between files. Progress and a final success/failure summary are
    printed to stdout.

    Args:
        input_file: File to scan for image links.
        output_dir: Directory that receives the images; created if missing.
        proxy_url: ``host:port`` of the proxy used for both HTTP and HTTPS
            requests.
    """
    # The same HTTP proxy endpoint is used for both schemes.
    proxies = {
        'http': f'http://{proxy_url}',
        'https': f'http://{proxy_url}',
    }
    print(f"使用代理: {proxy_url}")
    print(f"输出目录: {output_dir}")
    print("-" * 50)

    # parents=True so a nested output_dir passed by a caller also works.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    urls = extract_urls_from_file(input_file)
    if not urls:
        print("未找到任何Wikimedia链接")
        return

    total_count = len(urls)
    print(f"找到 {total_count} 个图片链接")
    print("-" * 50)

    success_count = 0
    # One session for all downloads so connections are reused; the context
    # manager closes the session when the batch is finished.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        for i, url in enumerate(urls, 1):
            print(f"\n[{i}/{total_count}]", end=" ")
            if download_image(url, output_dir, proxies, session):
                success_count += 1
            # Small delay between requests; not needed after the last one.
            if i < total_count:
                time.sleep(1)

    # Summary.
    print("\n" + "=" * 50)
    print(f"下载完成!")
    print(f"成功: {success_count}/{total_count}")
    print(f"失败: {total_count - success_count}/{total_count}")
    print(f"图片保存在: {os.path.abspath(output_dir)}")
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
附赠一些画作的链接(可保存为 test.py,供上面的脚本提取并下载):
PYTHON
# Newline-separated upload.wikimedia.org (Wikimedia Commons) image URLs of
# paintings; going by the file names: Van Gogh, Caspar David Friedrich,
# Monet, Bierstadt, Canaletto, Pissarro, Jacques-Louis David and others.
# NOTE(review): presumably the sample input scanned by the downloader
# script (its default input file is "test.py") — confirm.
art = """
https://upload.wikimedia.org/wikipedia/commons/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg
https://upload.wikimedia.org/wikipedia/commons/7/79/Noon%2C_rest_from_work_-_Van_Gogh.jpeg
https://upload.wikimedia.org/wikipedia/commons/d/d5/Vincent_Van_Gogh_0014.jpg
https://upload.wikimedia.org/wikipedia/commons/1/14/Van_Gogh_-_Bl%C3%BChender_Pfirsichbaum.jpeg
https://upload.wikimedia.org/wikipedia/commons/9/93/Caspar_David_Friedrich_-_Meeresk%C3%BCste_im_Mondlicht%2C_1818.jpg
https://upload.wikimedia.org/wikipedia/commons/0/0c/Caspar_David_Friedrich_-_Das_Eismeer_-_Hamburger_Kunsthalle_-_02.jpg
https://upload.wikimedia.org/wikipedia/commons/2/21/Caspar_David_Friedrich_-_Der_M%C3%B6nch_am_Meer_-_Google_Art_Project.jpg
https://upload.wikimedia.org/wikipedia/commons/5/59/Monet_-_Impression%2C_Sunrise.jpg
https://upload.wikimedia.org/wikipedia/commons/b/b1/%27The_Beach_at_%C3%89tretat%27_by_Claude_Monet%2C_1885-86%2C_Pushkin_Museum.jpg
https://upload.wikimedia.org/wikipedia/commons/7/78/Claude_Monet_-_The_Magpie_-_Google_Art_Project.jpg
https://upload.wikimedia.org/wikipedia/commons/6/6f/Monet_-_Charing_Cross_Bridge_Fog_on_the_Thames%2C_1903.jpg
https://upload.wikimedia.org/wikipedia/commons/9/93/Albert_Bierstadt_-_Puget_Sound_on_the_Pacific_Coast_%281870%29.jpg
https://upload.wikimedia.org/wikipedia/commons/2/28/Sunrise_on_the_Matterhorn_MET_DT218107.jpg
https://upload.wikimedia.org/wikipedia/commons/8/8c/Looking_Down_Yosemite-Valley.jpg
https://upload.wikimedia.org/wikipedia/commons/5/5f/Bierstadt_-_Among_the_Sierra_Nevada_Mountains_-_1868.jpg
https://upload.wikimedia.org/wikipedia/commons/8/83/The_Swing_%28P430%29.jpg
https://upload.wikimedia.org/wikipedia/commons/4/45/Canaletto_-_The_Stonemason%27s_Yard.jpg
https://upload.wikimedia.org/wikipedia/commons/b/b8/Rome%2C_a_view_of_the_Tiber%2C_Castel_Sant%27Angelo%2C_Ponte_Sant%27Angleo%2C_Saint_Peter%27s_Basilica_%28by_Hendrik_Frans_van_Lint%29.jpg
https://upload.wikimedia.org/wikipedia/commons/1/19/Charles-Fran%C3%A7ois_Daubigny_-_Harvest_-_Google_Art_Project.jpg
https://upload.wikimedia.org/wikipedia/commons/0/00/Camille_Pissarro%2C_The_Garden_of_the_Tuileries_on_a_Winter_Afternoon%2C_1899.jpg
https://upload.wikimedia.org/wikipedia/commons/8/8c/David_-_The_Death_of_Socrates.jpg
https://upload.wikimedia.org/wikipedia/commons/1/1e/Jacques-Louis_David_-_The_Coronation_of_Napoleon_%281805-1807%29.jpg
"""