1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
================================
作者:IT小章
博客:itxiaozhang.com
时间:2025年7月13日
Copyright © 2024 IT小章
================================
"""
import argparse
import logging
import os
import re
import time

import requests
from lxml import html
# Logging configuration: INFO level, timestamped format, mirrored to
# both a log file ("hmdb_scraper.log") and the console.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("hmdb_scraper.log"),
logging.StreamHandler()
]
)
# Module-wide logger used by all functions below.
logger = logging.getLogger("hmdb_scraper")
# Session factory (details redacted in this public copy).
def create_session(*args, **kwargs):
    """Build and return a configured ``requests.Session``.

    The real configuration (retries, headers, proxies, ...) is redacted
    in this public version; a plain session is returned.
    """
    # ...redacted implementation...
    return requests.Session()
# User-Agent rotation (details redacted in this public copy).
def get_random_user_agent():
    """Return a User-Agent header value.

    The real implementation picks one at random; this public copy
    always returns a fixed placeholder string.
    """
    # ...redacted implementation...
    ua = "User-Agent-Placeholder"
    return ua
# URL normalization (details redacted in this public copy).
def clean_url(url):
    """Normalize *url* before requesting it.

    The real cleaning rules are redacted; this public copy returns the
    input unchanged.
    """
    # ...redacted implementation...
    return url
# Fetch one metabolite record (core scraping logic redacted).
def get_metabolite_data(url, session=None, max_retries=3):
    """Fetch and parse a single HMDB metabolite page.

    Creates a session on demand and normalizes the URL first. The actual
    request/parse loop (up to *max_retries* attempts) is redacted; this
    public copy returns a placeholder record.
    """
    if session is None:
        session = create_session()
    url = clean_url(url)
    # ...redacted core scraping logic...
    placeholder = "屏蔽"
    return {
        "Compound ID": "HMDBXXXXXXX",
        "Class": placeholder,
        "Sub Class": placeholder,
        "Source": placeholder,
    }
# Resolve a single HMDB id, consulting and updating the shared cache.
def process_hmid(hmid, hmid_cache, session, retry_failed=True):
    """Return the metabolite record for *hmid*, caching the result.

    A cached entry is returned as-is unless *retry_failed* is true and
    the entry contains a failure marker, in which case the id is fetched
    again. A failed fetch is cached as a record of failure markers so
    subsequent calls do not hammer the site.
    """
    if hmid in hmid_cache:
        cached = hmid_cache[hmid]
        if retry_failed and "获取失败" in cached.values():
            logger.info(f"重试失败项: {hmid}")
        else:
            return cached
    url = f"https://hmdb.ca/metabolites/{hmid}"
    result = get_metabolite_data(url, session)
    if not result:
        result = {
            "Compound ID": hmid,
            "Class": "获取失败",
            "Sub Class": "获取失败",
            "Source": "获取失败",
        }
    hmid_cache[hmid] = result
    return result
def load_cache(cache_file):
    """Load the HMID result cache from *cache_file*.

    The real deserialization is redacted; this public copy always
    starts from an empty mapping.
    """
    # ...redacted cache-loading implementation...
    return dict()
def save_cache(cache_data, cache_file):
    """Persist *cache_data* to *cache_file*.

    The real serialization is redacted; this public copy is a no-op.
    """
    # ...redacted cache-saving implementation...
    return None
# Excel-driven batch processing (call structure kept, body redacted).
def process_excel_files(max_workers=5, use_cache=True, delay=0.5, retry_failed=True, chunk_size=100):
    """Drive the batch pipeline over the input Excel files.

    The redacted implementation fans work out across *max_workers*
    workers in chunks of *chunk_size*, optionally using the cache and
    retrying failed ids with *delay* seconds between requests. This
    public copy only logs a simulated run.
    """
    # ...redacted implementation; only the logging skeleton remains...
    logger.info("模拟处理Excel文件...")
    time.sleep(1)
    logger.info("模拟完成。")
def main():
    """CLI entry point: parse options, then run the (redacted) pipeline."""
    arg_parser = argparse.ArgumentParser(description='HMDB数据爬取工具(公开版本)')
    arg_parser.add_argument('--workers', type=int, default=5)
    arg_parser.add_argument('--no-cache', action='store_true')
    arg_parser.add_argument('--delay', type=float, default=0.5)
    arg_parser.add_argument('--no-retry-failed', action='store_true')
    arg_parser.add_argument('--chunk-size', type=int, default=100)
    opts = arg_parser.parse_args()
    logger.info("启动爬虫(敏感实现已屏蔽)")
    # Negative flags (--no-*) are inverted into the positive keyword
    # arguments the pipeline expects.
    process_excel_files(
        max_workers=opts.workers,
        use_cache=not opts.no_cache,
        delay=opts.delay,
        retry_failed=not opts.no_retry_failed,
        chunk_size=opts.chunk_size,
    )


if __name__ == "__main__":
    main()