import base64
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed


def download_resource(url, base_url, scraper):
    """Fetch a resource and return it inlined as a base64 data URI.

    Falls back to the resolved absolute URL if the download fails, so the
    saved page still points at something loadable.
    """
    # Resolve relative references against the page URL before fetching;
    # passing a relative path straight to scraper.get() would fail.
    absolute_url = urljoin(base_url, url)
    try:
        response = scraper.get(absolute_url, timeout=10)
        if response.status_code == 200:
            content_type = response.headers.get('content-type', '').split(';')[0]
            encoded = base64.b64encode(response.content).decode('utf-8')
            return f"data:{content_type};base64,{encoded}"
    except Exception:
        pass
    return absolute_url
 
def save_webpage(url, output_file):
    # cloudscraper mimics a real Chrome browser to get past Cloudflare's
    # anti-bot checks that would block a plain requests session.
    scraper = cloudscraper.create_scraper(browser='chrome')

    response = scraper.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Inline images, scripts, and stylesheets as data URIs so the saved
    # file is self-contained.
    for tag in soup.find_all(['img', 'script', 'link']):
        if tag.name in ('img', 'script') and tag.has_attr('src'):
            tag['src'] = download_resource(tag['src'], url, scraper)
        elif tag.name == 'link' and tag.has_attr('href'):
            tag['href'] = download_resource(tag['href'], url, scraper)

    # Inline url(...) references inside inline style attributes. Skipping
    # the text before the first 'url(' avoids mangling unrelated CSS that
    # happens to contain a closing parenthesis.
    for tag in soup.find_all(style=True):
        style = tag['style']
        chunks = [c for c in style.split('url(')[1:] if ')' in c]
        for chunk in chunks:
            old_url = chunk.split(')')[0].strip("'\" ")
            new_url = download_resource(old_url, url, scraper)
            style = style.replace(f"url({old_url})", f"url({new_url})")
        tag['style'] = style

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(str(soup))

    print(f"Webpage saved to {output_file}")
 
def save_multiple_webpages(urls_and_outputs):
    # Save several pages concurrently, one worker thread per page.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {
            executor.submit(save_webpage, url, output): url
            for url, output in urls_and_outputs
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                future.result()
            except Exception as exc:
                print(f'{url} raised an exception while being saved: {exc}')
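

# The resource loop in save_webpage runs serially. Since ThreadPoolExecutor
# is already imported, the per-resource downloads could be parallelized as
# well. This is a minimal sketch, not part of the original script: it assumes
# the download_resource defined above, and that sharing one cloudscraper
# session across threads is acceptable (requests-style sessions usually
# tolerate concurrent reads, but thread safety is not formally guaranteed).
def inline_resources_concurrently(soup, base_url, scraper, max_workers=10):
    # Collect (tag, attribute) pairs that reference external resources.
    targets = []
    for tag in soup.find_all(['img', 'script', 'link']):
        attr = 'href' if tag.name == 'link' else 'src'
        if tag.has_attr(attr):
            targets.append((tag, attr))
    # Download in parallel; mutate the tags back on the main thread so
    # BeautifulSoup's tree is only modified from one thread.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(download_resource, tag[attr], base_url, scraper): (tag, attr)
            for tag, attr in targets
        }
        for future in as_completed(futures):
            tag, attr = futures[future]
            tag[attr] = future.result()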
 
 
if __name__ == "__main__":
    urls_and_outputs = [
        ("https://lasempresas.com.mx/", "saved_webpage_1.html"),
        ("https://indialei.in/", "saved_webpage_2.html"),
        ("https://www.zaubacorp.com/", "saved_webpage_3.html"),
    ]

    save_multiple_webpages(urls_and_outputs)
 