JSON files: the complete files are too large to reproduce here, so only a small portion of the data is shown:
```
# city.json
{
  "郑州": "https://www.dianping.com/zhengzhou",
  "珠海": "https://www.dianping.com/zhuhai",
  "张家口": "https://www.dianping.com/zhangjiakou"
}

# menu.json
{
  "美食": "https://www.dianping.com/{}/ch10",
  "丽人": "https://www.dianping.com/{}/beauty",
  "周边游": "https://www.dianping.com/{}/ch35"
}
# menu.json is generated automatically by the code further below, in the format shown above.

# cookies.json
[{}]
# The cookie values are omitted here for privacy. The steps below walk through how to
# obtain a working set of cookies automatically, save it locally, and load it whenever needed.
```
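The `{}` in each menu.json value is a placeholder for the city slug taken from city.json. As a quick illustration of how the two files fit together (a sketch only; it assumes both files sit next to the script and uses 珠海/美食 purely as example keys):

```python
import json

# Load the two lookup tables used by the crawler.
with open('city.json', encoding='utf-8') as f:
    cities = json.load(f)
with open('menu.json', encoding='utf-8') as f:
    menus = json.load(f)

city_slug = cities['珠海'].split('/')[-1]       # -> "zhuhai"
listing_url = menus['美食'].format(city_slug)   # -> "https://www.dianping.com/zhuhai/ch10"
print(listing_url)
```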
```bat
@echo off
cd "C:\Program Files\Google\Chrome\Application"
start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
```
```python
#!/usr/bin/env python3
# coding:utf-8
import subprocess
import bag
import time
import random

# The .bat file can also be written out from Python instead of by hand:
# batch_file_content = r'''
# @echo off
# cd "C:\Program Files\Google\Chrome\Application"
# start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
# '''
#
# with open('run_chrome.bat', 'w') as f:
#     f.write(batch_file_content)

subprocess.Popen('run_chrome.bat', shell=True)   # launch Chrome with remote debugging enabled

web = bag.Bag.web_debug()                        # attach to the debugging Chrome instance
web.get(r'https://www.dianping.com/')
time.sleep(random.randint(5, 10))                # give the page (and login state) time to load

cookie = web.get_cookies()
web.close()

bag.Bag.save_json(cookie, r'./cookies.json')     # save the cookies locally for later requests
```
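`bag` is the author's own helper package. If it is not available, the same cookie grab can be done directly with selenium by attaching to the debugging port opened by the .bat file. This is only a sketch of what `web_debug()` presumably does; it assumes selenium is installed and that the port matches the 9222 from the batch file:

```python
#!/usr/bin/env python3
# coding:utf-8
import json
import random
import subprocess
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

subprocess.Popen('run_chrome.bat', shell=True)    # start Chrome with --remote-debugging-port=9222
time.sleep(3)                                     # give the browser a moment to come up

options = Options()
options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')  # attach to the running Chrome
driver = webdriver.Chrome(options=options)

driver.get('https://www.dianping.com/')
time.sleep(random.randint(5, 10))                 # leave time for the page (and any login) to settle

cookies = driver.get_cookies()                    # list of dicts: name, value, domain, ...
driver.quit()

with open('cookies.json', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False, indent=2)
```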
Create a new text file, copy the first code block into it, and change the extension to .bat. Why do it this way? Mainly because, once it is a batch file, the whole browser-startup step can be driven from Python with subprocess.
Run the cookie-collection script shown above and a usable cookies.json is generated automatically.
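For readers without the `bag` package, `bag.session.create_session()` plus the cookie loop used in the scripts below boils down to a plain `requests.Session` with the saved cookies attached. A minimal sketch (the User-Agent value is only an example):

```python
import json
import requests


def create_session(cookie_path='./cookies.json'):
    """Build a requests session that sends the cookies saved by the script above."""
    session = requests.Session()
    session.headers['User-Agent'] = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    )
    with open(cookie_path, encoding='utf-8') as f:
        for cookie in json.load(f):
            session.cookies.set(cookie['name'], cookie['value'])
    return session


if __name__ == '__main__':
    s = create_session()
    print(s.get('https://www.dianping.com/').status_code)   # 200 means the cookies are accepted
```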
```python
#!/usr/bin/env python3
# coding:utf-8
import bag
from bs4 import BeautifulSoup
import re

session = bag.session.create_session()
for cookie in bag.Bag.read_json(r'./cookies.json'):
    session.cookies.set(cookie['name'], cookie['value'])


# Ask for the city to crawl and build menu.json from its homepage navigation.
def choose_city():
    js_data = bag.Bag.read_json('./city.json')
    choose = input('输入城市名:')
    judge = js_data.get(choose)   # check whether the entered city exists

    # Capture the href and link text of each category <a> tag
    # (the pattern may need adjusting if the page markup changes).
    pattern = re.compile(r'<a.*?href="(.*?)".*?>(.*?)</a>', re.S)
    dic = {}
    if judge:
        resp = session.get(judge)
        html = BeautifulSoup(resp.text, 'lxml')
        soup = html.findAll('span', class_='span-container')
        for info in soup:
            data = re.findall(pattern, str(info))
            if not data:
                continue
            mid: list = data[0][0].split('/')
            mid[-2] = '{}'                              # replace the city slug with a placeholder
            dic[data[0][1]] = 'https:' + '/'.join(mid)  # rebuild the URL as a template
    else:
        print('无效输入!')
        return choose_city()

    print(dic)
    # Result generated for the sample input:
    '''输入城市名:珠海
    {
        "美食": "https://www.dianping.com/{}/ch10",
        "休闲娱乐": "https://www.dianping.com/{}/ch30",
        "结婚": "https://www.dianping.com/{}/wedding",
        "电影演出赛事": "https://www.dianping.com/{}/movie",
        "丽人": "https://www.dianping.com/{}/beauty",
        "酒店": "https://www.dianping.com/{}/hotel",
        "亲子": "https://www.dianping.com/{}/baby",
        "周边游": "https://www.dianping.com/{}/ch35",
        "运动健身": "https://www.dianping.com/{}/ch45",
        "购物": "https://www.dianping.com/{}/ch20",
        "家装": "https://www.dianping.com/{}/home",
        "学习培训": "https://www.dianping.com/{}/education",
        "生活服务": "https://www.dianping.com/{}/ch80",
        "医疗健康": "https://www.dianping.com/{}/ch85",
        "爱车": "https://www.dianping.com/{}/ch65",
        "宠物": "https://www.dianping.com/{}/ch95"
    }'''
    bag.Bag.save_json(dic, r'./menu.json')


if __name__ == '__main__':
    choose_city()
```
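Regex over raw HTML is brittle, so here is an alternative sketch of just the parsing step using BeautifulSoup attribute access. It assumes the same `span-container` markup as the script above and produces the same URL templates:

```python
from bs4 import BeautifulSoup


def parse_menu(html_text: str) -> dict:
    """Extract {category name: URL template} pairs from a Dianping city homepage."""
    dic = {}
    soup = BeautifulSoup(html_text, 'lxml')
    for span in soup.find_all('span', class_='span-container'):
        a = span.find('a', href=True)
        if not a:
            continue
        parts = a['href'].split('/')       # e.g. //www.dianping.com/zhuhai/ch10
        if len(parts) < 2:
            continue
        parts[-2] = '{}'                   # replace the city slug with a placeholder
        dic[a.get_text(strip=True)] = 'https:' + '/'.join(parts)
    return dic
```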
```python
# choose.py
#!/usr/bin/env python3
# coding:utf-8
import bag


def choose_city():
    session = bag.session.create_session()
    for cookie in bag.Bag.read_json(r'./cookies.json'):
        session.cookies.set(cookie['name'], cookie['value'])
    session.headers['Connection'] = 'close'

    js_data = bag.Bag.read_json('./city.json')
    choose = input('输入城市名:')
    judge = js_data.get(choose)                 # check whether the entered city exists
    if judge:
        city = judge.split('/')[-1]             # city slug, e.g. "zhuhai"
        choose_1 = input('输入爬取类型:')
        js_data1 = bag.Bag.read_json('./menu.json')
        judge1 = js_data1.get(choose_1)
        if judge1:
            # Fill the city slug into the category URL template and hand back
            # the ready-to-use URL together with the cookie-carrying session.
            return judge1.format(city), session
        else:
            print('开发中......')
            return None, None
    else:
        print('无效输入!')
        return None, None
```
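A quick way to sanity-check choose.py on its own; the prompt answers shown in the comments are only an example:

```python
from choose import choose_city

# Example interaction:
#   输入城市名:珠海
#   输入爬取类型:美食
url, session = choose_city()
if url:
    print(url)                           # e.g. https://www.dianping.com/zhuhai/ch10
    print(session.get(url).status_code)  # 200 means both the URL and the cookies are good
```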
```python
# get_shop.py
#!/usr/bin/env python3
# coding:utf-8
import bag
import choose
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests

PROXY_TEMPLATE = "http://{}:{}"          # scheme://ip:port
proxies = {
    "http": "",                          # filled in by check()
}


def check():
    """Pick the first proxy from the local pool that can still reach Dianping."""
    url_ = r'https://www.dianping.com/zhuhai/ch10'
    ip_ls = bag.Bag.read_json('../代理ip/IP地址.json')
    index = 0
    if len(ip_ls) == 0:
        print('IP地址全部失效')
        exit()
    for ip_address in ip_ls:
        proxies_ = {
            "http": PROXY_TEMPLATE.format(ip_address[0], ip_address[1]),
        }
        try:
            resp = session.get(url_, proxies=proxies_)
        except requests.exceptions.RequestException:
            index += 1
            continue
        if resp.status_code == 200:
            proxies['http'] = PROXY_TEMPLATE.format(ip_address[0], ip_address[1])  # switch to this proxy
            bag.Bag.save_json(ip_ls[index:], r'../代理ip/IP地址.json')               # drop the dead proxies
            print(f'[{index}] 更换ip成功')
            return
        index += 1
    print('IP地址全部失效')
    exit()


url, session = choose.choose_city()


def get_types():
    """Collect the sub-category links from the chosen listing page."""
    check()   # make sure a working proxy is set before fetching
    # Capture the href and text of each <a> inside the "classfy" block.
    pattern = re.compile(r'<a href="(.*?)".*?>(.*?)</a>', re.S)
    if bool(url):
        resp = session.get(url, proxies=proxies)
        html = BeautifulSoup(resp.text, 'lxml')
        soup = html.findAll('div', id='classfy')
        links = re.findall(pattern, str(soup))
        return links
    else:
        check()
        return get_types()


def get_shop():
    links = get_types()
    # Capture groups per shop <li>: URL, name, review count, average price,
    # category, district; optional groups come back as None when missing.
    # The markup in this pattern may need adjusting to the live page structure.
    pattern = re.compile(
        r'<div class="tit">.*?<a href="(.*?)".*?title="(.*?)"'
        r'(?:.*?<a class="review-num".*?<b>(.*?)</b>)?'
        r'(?:.*?<a class="mean-price".*?<b>(.*?)</b>)?'
        r'(?:.*?<span class="tag">(.*?)</span>.*?<span class="tag">(.*?)</span>)?',
        re.S)
    number = re.compile(r'data-ga-page="(.*?)"', re.S)
    result = []
    if not bool(links):
        print('获取异常')
        return
    for link in links:
        try:
            # Fetch the first page of this sub-category.
            resp = session.get(link[0], proxies=proxies)
            page = [int(i) for i in re.findall(number, resp.text)]
            page_num = max(page) if page else 1          # highest page number in the pager
            html = BeautifulSoup(resp.text, 'lxml')
            soup = html.findAll('li', class_='')
            for i in soup:
                for j in re.findall(pattern, str(i)):
                    result.append(j)
            if page_num >= 2:
                # Fetch the remaining pages.
                for count in tqdm(range(page_num)[1:]):
                    try:
                        resp1 = session.get(link[0] + 'p{}'.format(count + 1), proxies=proxies)
                        html1 = BeautifulSoup(resp1.text, 'lxml')
                        soup1 = html1.findAll('li', class_='')
                        for k in soup1:
                            info = pattern.search(str(k))
                            if info:
                                groups = list(info.groups())
                                for i in range(len(groups)):
                                    if not groups[i]:
                                        groups[i] = 'null'   # keep the column count stable
                                result.append(tuple(groups))
                    except requests.exceptions.RequestException as e:
                        print(e)
                        check()
                    except Exception as e:
                        print(e)
                        continue
        except requests.exceptions.RequestException as e:
            print(e)
            check()
        except Exception as e:
            print(e)
            check()
    return result


end = get_shop()
bag.Bag.save_excel(end, './商店.xlsx')
```
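`bag.Bag.save_excel` is also part of the helper library; writing the collected tuples out with pandas produces the same kind of spreadsheet. A minimal sketch, assuming openpyxl is installed and that the six columns match the capture groups above (the header names are my own labels, not defined by the source):

```python
import pandas as pd


def save_excel(rows, path='./商店.xlsx'):
    """Dump the scraped shop tuples into an .xlsx file."""
    columns = ['链接', '店名', '评论数', '人均价格', '分类', '地区']   # illustrative headers
    pd.DataFrame(rows, columns=columns).to_excel(path, index=False)


# save_excel(end)   # `end` is the list returned by get_shop() above
```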