 | """================================
 作者:IT小章
 网站:itxiaozhang.com
 时间:2024年12月01日
 Copyright © 2024 IT小章
 ================================
 """
 
import threading
import tkinter as tk
from tkinter import ttk, messagebox
import requests
import time
import json
from bs4 import BeautifulSoup
import logging
from datetime import datetime
import random
import re
from queue import Queue, Empty
 
 
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
 
class Config:
    """Configuration management."""

    # City name -> {city name: "<58.com subdomain>|<city id>"}
    CITIES = {
        "北京": {"北京": "bj|1"},
        "上海": {"上海": "sh|2"},
        "广州": {"广州": "gz|3"},
        "深圳": {"深圳": "sz|4"}
    }

    # Listing category -> URL path on 58.com
    HOUSE_TYPES = {
        "商铺": "/shangpucz/0/",
        "写字楼": "/zhaozu/0/",
        "厂房": "/changfang/0/",
        "生意转让": "/shengyizr/0/"
    }

    @staticmethod
    def load_config(file_path="config.json"):
        """Load an optional JSON config file (not currently consumed elsewhere in this script)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            logging.warning(f"Config file {file_path} not found, using default configuration")
            return {}
 
class ProxyPool:
    """Proxy IP pool manager."""
    def __init__(self):
        self.proxies = Queue()
        self.lock = threading.Lock()

    def add_proxy(self, proxy):
        """Add a proxy (a requests-style dict, e.g. {"http": ..., "https": ...})."""
        self.proxies.put(proxy)

    def get_proxy(self):
        """Return a proxy, or None if the pool is empty.

        Note: the returned proxy is removed from the queue; re-add it to reuse it.
        """
        try:
            # get_nowait() so an empty pool falls back to a direct connection
            # instead of blocking forever
            return self.proxies.get_nowait()
        except Empty:
            return None

    def remove_proxy(self, proxy):
        """Remove a dead proxy."""
        with self.lock:
            if proxy in self.proxies.queue:
                self.proxies.queue.remove(proxy)
 
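# Example use of ProxyPool (illustrative only, not part of the original script;
# the proxy address below is a placeholder):
#
#   pool = ProxyPool()
#   pool.add_proxy({"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"})
#   proxy = pool.get_proxy()  # the dict above, or None when the pool is empty
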
class DataStorage:
    """Data storage."""
    def __init__(self):
        self.data_queue = Queue()

    def save(self, data):
        """Save a record.
        Replace this with real persistence (CSV, database, ...) for actual use.
        """
        logging.info(f"Saving data: {data}")
        self.data_queue.put(data)
 
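# A minimal persistent-storage sketch, matching the usage notes at the bottom of this
# file (illustrative; the JsonlStorage name and results.jsonl path are assumptions,
# not part of the original script). Wire it in by replacing DataStorage() in
# HouseCrawler.__init__ below.
class JsonlStorage(DataStorage):
    """DataStorage variant that appends each record as one JSON line."""
    def __init__(self, path="results.jsonl"):
        super().__init__()
        self.path = path

    def save(self, data):
        super().save(data)
        with open(self.path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
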
class HouseCrawler:
    """Crawler for property listing pages."""
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.storage = DataStorage()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def get_page(self, url, retry_times=3):
        """Fetch a page, retrying (with a fresh proxy, if any) on failure."""
        for _ in range(retry_times):
            proxy = self.proxy_pool.get_proxy()
            try:
                response = requests.get(url, headers=self.headers, proxies=proxy, timeout=10)
                response.encoding = 'utf-8'
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logging.error(f"Failed to fetch page: {e}")
                if proxy:
                    self.proxy_pool.remove_proxy(proxy)
        return None

    def parse_list_page(self, html):
        """Parse a list page into {'title', 'link'} records."""
        if not html:
            return []

        try:
            soup = BeautifulSoup(html, 'lxml')
            # Placeholder selectors; adjust them to the actual page structure
            items = soup.select('ul .item')
            results = []

            for item in items:
                try:
                    title = item.select_one('.title').text.strip()
                    link = item.select_one('.link')['href']
                    results.append({
                        'title': title,
                        'link': link
                    })
                except Exception as e:
                    logging.error(f"Failed to parse list item: {e}")

            return results
        except Exception as e:
            logging.error(f"Failed to parse list page: {e}")
            return []

    def parse_detail_page(self, html):
        """Parse a detail page (stub: field extraction is left to the reader)."""
        if not html:
            return None

        try:
            data = {
                'title': '',
                'price': '',
                'area': '',
                'location': '',
                'contact': '',
                'description': ''
            }

            soup = BeautifulSoup(html, 'lxml')
            # Fill in the fields above with selectors matching the target page,
            # e.g. data['title'] = soup.select_one(<title selector>).text.strip()

            return data
        except Exception as e:
            logging.error(f"Failed to parse detail page: {e}")
            return None
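
# Example: using HouseCrawler directly, without the GUI (illustrative only; the URL
# follows the pattern produced by GUI.generate_url below):
#
#   crawler = HouseCrawler()
#   html = crawler.get_page("https://bj.58.com/shangpucz/0/pn1/")
#   for item in crawler.parse_list_page(html or ""):
#       print(item['title'], item['link'])
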
class GUI:
    """Tkinter GUI for the crawler."""
    def __init__(self):
        self.root = tk.Tk()
        self.root.title("58同城 Property Listing Collector (Learning Edition)")
        self.root.geometry('500x400')
        self.crawler = HouseCrawler()
        self.setup_gui()

    def setup_gui(self):
        """Build the GUI widgets."""
        # City selector
        tk.Label(self.root, text="City:").grid(row=0, column=0, padx=5, pady=5)
        self.city_var = tk.StringVar()
        self.city_combo = ttk.Combobox(self.root, textvariable=self.city_var, state="readonly")
        self.city_combo['values'] = list(Config.CITIES.keys())
        self.city_combo.current(0)
        self.city_combo.grid(row=0, column=1, padx=5, pady=5)

        # Property type selector
        tk.Label(self.root, text="Property type:").grid(row=1, column=0, padx=5, pady=5)
        self.type_var = tk.StringVar()
        self.type_combo = ttk.Combobox(self.root, textvariable=self.type_var, state="readonly")
        self.type_combo['values'] = list(Config.HOUSE_TYPES.keys())
        self.type_combo.current(0)
        self.type_combo.grid(row=1, column=1, padx=5, pady=5)

        # Number of pages to crawl
        tk.Label(self.root, text="Pages to crawl:").grid(row=2, column=0, padx=5, pady=5)
        self.pages_var = tk.StringVar(value="1")
        self.pages_entry = tk.Entry(self.root, textvariable=self.pages_var)
        self.pages_entry.grid(row=2, column=1, padx=5, pady=5)

        # Interval between pages, in seconds
        tk.Label(self.root, text="Interval (seconds):").grid(row=3, column=0, padx=5, pady=5)
        self.interval_var = tk.StringVar(value="5")
        self.interval_entry = tk.Entry(self.root, textvariable=self.interval_var)
        self.interval_entry.grid(row=3, column=1, padx=5, pady=5)

        # Status log
        self.status_text = tk.Text(self.root, height=10, width=50)
        self.status_text.grid(row=4, column=0, columnspan=2, padx=5, pady=5)

        # Start / stop buttons
        self.start_button = tk.Button(self.root, text="Start", command=self.start_crawl)
        self.start_button.grid(row=5, column=0, padx=5, pady=5)

        self.stop_button = tk.Button(self.root, text="Stop", command=self.stop_crawl)
        self.stop_button.grid(row=5, column=1, padx=5, pady=5)

        self.is_running = False
 
    def log_message(self, message):
        """Append a timestamped line to the status log."""
        # Also called from the worker thread; for strict Tk thread safety,
        # updates could be routed through self.root.after() instead.
        self.status_text.insert(tk.END, f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")
        self.status_text.see(tk.END)

    def start_crawl(self):
        """Start a crawl task."""
        if self.is_running:
            messagebox.showwarning("Warning", "A crawl task is already running")
            return

        try:
            pages = int(self.pages_var.get())
            interval = int(self.interval_var.get())
            if pages < 1 or interval < 1:
                raise ValueError
        except ValueError:
            messagebox.showerror("Error", "Please enter a valid page count and interval")
            return

        self.is_running = True
        # Daemon thread so a running task does not keep the process alive after the window closes
        threading.Thread(target=self.crawl_task, args=(pages, interval), daemon=True).start()
        self.log_message("Crawl task started...")

    def stop_crawl(self):
        """Stop the crawl task."""
        self.is_running = False
        self.log_message("Stopping crawl task...")
 
    def crawl_task(self, pages, interval):
        """Worker: crawl the selected number of pages."""
        city = self.city_var.get()
        house_type = self.type_var.get()

        try:
            for page in range(1, pages + 1):
                if not self.is_running:
                    break

                self.log_message(f"Crawling page {page}...")
                url = self.generate_url(city, house_type, page)

                # Fetch and parse the list page
                html = self.crawler.get_page(url)
                if not html:
                    self.log_message(f"Failed to fetch page {page}, skipping...")
                    continue

                items = self.crawler.parse_list_page(html)
                self.log_message(f"Page {page}: found {len(items)} listings")

                # Fetch and parse each detail page
                for item in items:
                    if not self.is_running:
                        break

                    detail_html = self.crawler.get_page(item['link'])
                    if detail_html:
                        detail_data = self.crawler.parse_detail_page(detail_html)
                        if detail_data:
                            self.crawler.storage.save(detail_data)
                            self.log_message(f"Collected: {item['title']}")

                # Wait between pages (detail requests above are not throttled)
                time.sleep(interval)

        except Exception as e:
            self.log_message(f"Error during crawl: {str(e)}")
        finally:
            self.is_running = False
            self.log_message("Crawl task finished")
 
    def generate_url(self, city, house_type, page):
        """Build the target list-page URL."""
        city_code = Config.CITIES[city][city].split('|')[0]
        type_path = Config.HOUSE_TYPES[house_type]
        return f"https://{city_code}.58.com{type_path}pn{page}/"
 
    def run(self):
        """Run the GUI main loop."""
        self.root.mainloop()
 
def main():
    """Program entry point."""
    try:
        app = GUI()
        app.run()
    except Exception as e:
        logging.error(f"Program error: {str(e)}")
        messagebox.showerror("Error", f"Program error: {str(e)}")
 
if __name__ == "__main__":
    """
    Usage notes:
    1. Install dependencies:
       pip install requests beautifulsoup4 lxml

    2. Run the program:
       python crawler.py

    3. Steps:
       - Select the target city
       - Select the property type
       - Set the number of pages to crawl
       - Set the crawl interval
       - Click "Start"

    4. Notes:
       - An interval of at least 5 seconds is recommended
       - Start with a small page count and scale up
       - Check the log output if errors occur
       - Using proxy IPs can improve the crawl success rate

    5. Data storage:
       - By default, records are only written to the log
       - Modify or replace DataStorage for other storage backends
         (see the JsonlStorage sketch above)

    6. Proxy setup:
       - Direct connection is used by default
       - Seed ProxyPool with proxies (see the ProxyPool example above)
         to route requests through them
    """
    main()