import base64
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import cloudscraper
from bs4 import BeautifulSoup
def download_resource(url, base_url, scraper):
    """Fetch a resource and return it inlined as a ``data:`` URI.

    Parameters
    ----------
    url : str
        Resource URL as it appears in the page; may be relative.
    base_url : str
        URL of the page, used to absolutize *url*.
    scraper : object
        Session-like object exposing ``get(url, timeout=...)``.

    Returns
    -------
    str
        ``data:<mime>;base64,<payload>`` on a 200 response, otherwise
        the absolute URL so the saved page still points at the original
        resource.
    """
    # Absolutize first: the original fetched relative URLs verbatim, so
    # they always failed and were never inlined.
    absolute_url = urljoin(base_url, url)
    try:
        response = scraper.get(absolute_url, timeout=10)
        if response.status_code == 200:
            # Keep only the bare MIME type; parameters like charset do
            # not belong in the data-URI media type here.
            content_type = response.headers.get('content-type', '').split(';')[0]
            payload = base64.b64encode(response.content).decode('utf-8')
            return f"data:{content_type};base64,{payload}"
    except Exception:
        # Narrowed from a bare ``except:`` (which also caught
        # KeyboardInterrupt/SystemExit). Download failure is expected and
        # best-effort: fall through to the absolute-URL fallback.
        pass
    return absolute_url
def save_webpage(url, output_file):
    """Download *url* and write a self-contained HTML snapshot.

    Every ``<img>``/``<script>`` ``src``, ``<link>`` ``href`` and inline
    ``style`` ``url(...)`` reference is replaced via
    :func:`download_resource` with a base64 ``data:`` URI (or an absolute
    URL when the download fails).

    Parameters
    ----------
    url : str
        Page to snapshot.
    output_file : str
        Path of the HTML file to write (UTF-8).
    """
    scraper = cloudscraper.create_scraper(browser='chrome')
    response = scraper.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Which attribute carries the resource reference for each tag type.
    url_attrs = {'img': 'src', 'script': 'src', 'link': 'href'}
    for tag in soup.find_all(['img', 'script', 'link']):
        attr = url_attrs[tag.name]
        if tag.has_attr(attr):
            tag[attr] = download_resource(tag[attr], url, scraper)

    # Rewrite url(...) tokens in inline style attributes in one regex
    # pass. The previous split-on-'url(' scheme stripped the quotes from
    # url('x') and then searched for the unquoted "url(x)", so quoted
    # references were never actually replaced.
    css_url = re.compile(r"""url\(\s*(['"]?)(.*?)\1\s*\)""")
    for tag in soup.find_all(style=True):
        def _inline(match):
            return f"url({download_resource(match.group(2), url, scraper)})"
        tag['style'] = css_url.sub(_inline, tag['style'])

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print(f"网页已保存至 {output_file}")
def save_multiple_webpages(urls_and_outputs):
    """Snapshot several pages concurrently.

    Parameters
    ----------
    urls_and_outputs : iterable of (str, str)
        ``(url, output_file)`` pairs; each pair is handed to
        :func:`save_webpage` on a worker thread. A failed page is
        reported but does not abort the remaining downloads.
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = {}
        for page_url, out_path in urls_and_outputs:
            pending[pool.submit(save_webpage, page_url, out_path)] = page_url
        for done in as_completed(pending):
            page_url = pending[done]
            try:
                done.result()
            except Exception as exc:
                print(f'{page_url} 生成过程中产生了一个异常: {exc}')
if __name__ == "__main__":
    # Pages to snapshot: (source URL, local output file). Guarded so the
    # module can be imported without kicking off the downloads.
    urls_and_outputs = [
        ("https://lasempresas.com.mx/", "saved_webpage_1.html"),
        ("https://indialei.in/", "saved_webpage_2.html"),
        ("https://www.zaubacorp.com/", "saved_webpage_3.html"),
    ]
    save_multiple_webpages(urls_and_outputs)