本文共 3232 字,大约阅读时间需要 10 分钟。
1、配置scrapy调试
在工程目录下新建 Python 文件 main.py,用于调试项目(当然也可以使用 pdb 进行调试)
main.py
# Debug entry point: running this file is equivalent to `scrapy crawl Buycar`,
# which lets the spider be started (and stepped through) from an IDE debugger.
import os
import sys

from scrapy.cmdline import execute

# Put the directory containing this file on sys.path so the project's own
# modules can be imported when the script is launched directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

execute(["scrapy", "crawl", "Buycar"])
2、在 settings.py 中将 ROBOTSTXT_OBEY 设置为 False(不遵守 robots 协议)
3、拿到xsrf
import http.cookiejar as cookielib
import re

import requests
from bs4 import BeautifulSoup

agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    # The header name is spelled with a hyphen ("User-Agent"), not an underscore.
    "User-Agent": agent,
}


def get_xsrf():
    """Request the zhihu home page and return the value of its ``_xsrf`` cookie.

    Returns:
        The ``value`` attribute of the ``_xsrf`` cookie found in the request's
        cookie jar.

    Raises:
        AttributeError: If the expected domain, path or cookie is missing
            (one of the chained ``get`` calls returns ``None``).
    """
    response = requests.get("https://www.zhihu.com", headers=header)
    # The jar is indexed as domain -> path -> cookie name.
    xsrf = response.request._cookies._cookies.get('.zhihu.com').get('/').get('_xsrf').value
    return xsrf


get_xsrf()
4、登录全部代码:
import http.cookiejar
import re

import requests

# One shared session so that cookies set by the login response are kept.
session = requests.session()

agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    # The header name is spelled with a hyphen ("User-Agent"), not an underscore.
    "User-Agent": agent,
}


def get_xsrf():
    """Request the zhihu home page and return the value of its ``_xsrf`` cookie.

    Raises:
        AttributeError: If the expected domain, path or cookie is missing
            (one of the chained ``get`` calls returns ``None``).
    """
    # NOTE(review): this uses requests.get rather than the shared session, so
    # the cookie obtained here is not stored in `session` -- confirm intended.
    response = requests.get('https://www.zhihu.com', headers=header)
    # The jar is indexed as domain -> path -> cookie name.
    xsrf = response.request._cookies._cookies.get('.zhihu.com').get('/').get('_xsrf').value
    return xsrf


def zhihu_login(account, password):
    """Post phone-number credentials to zhihu and print the outcome.

    Args:
        account: Phone number, sent as the ``phone`` form field.
        password: Password, sent as the ``password`` form field.
    """
    post_url = 'https://www.zhihu.com/login/phone_num'
    post_data = {
        '_xsrf': get_xsrf(),
        'phone': account,
        'password': password,
    }
    response = session.post(post_url, data=post_data, headers=header)
    # NOTE(review): only the HTTP status is inspected; a 200 response may
    # still carry a login error in its body -- verify against the endpoint.
    if response.status_code == 200:
        print('登陆成功')
    else:
        print('登陆失败')


zhihu_login('18328020353', '*****')
一篇关于 webdriver xpath 使用的文章:
http://toolsqa.com/selenium-webdriver/choosing-effective-xpath/
https://blog.csdn.net/passionboyxie/article/details/28632965
获取属性值:
https://blog.csdn.net/xm_csdn/article/details/53390649
dr = driver.find_element_by_id('tooltip')
dr.get_attribute('data-original-title')  # 获取tooltip的内容
我们唯一确定的是文本 'Profile' 将始终包含在此图像的 src 中,因此我们可以在 xpath 中使用此提示,如下所示:
web.find_element_by_xpath(".//*[@class='Login-content']/form/button/img[contains(@src,'Profile')]").click()
常用查找方式
find_element_by_name
find_element_by_id
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
5、处理知乎验证码
class JSMiddleware(object):
    """Middleware that loads pages for the "DouyuImage" spider in Chrome.

    Requests from any other spider are passed through untouched (``None`` is
    returned without starting a browser).
    """

    # Path of the chromedriver executable handed to Selenium.
    driver_path = "E:/software/python3.6/chromedriver.exe"

    def process_request(self, request, spider):
        """Fetch ``request.url`` with Chrome when the spider is "DouyuImage".

        Args:
            request: The outgoing request; its ``url`` is opened in the browser.
            spider: The spider issuing the request; only its ``name`` is read.

        Returns:
            An ``HtmlResponse`` built from the browser's page source, or
            ``None`` when the spider is not "DouyuImage" or the fetch failed.
        """
        # Guard first so a browser is only launched for the spider that needs it.
        if spider.name != "DouyuImage":
            return None

        web = webdriver.Chrome(self.driver_path)
        try:
            web.get(request.url)
            # Fixed pause before reading the page source; presumably gives the
            # page's scripts time to finish rendering -- TODO confirm.
            time.sleep(3)
            body = web.page_source
            print("访问:{0}".format(request.url))
            print("^" * 50)
            return HtmlResponse(url=web.current_url, body=body, encoding="utf-8", request=request)
        except Exception as e:
            # Best effort: report the failure and fall back to returning None.
            print(e)
            print("webdriver 失败")
            return None
        finally:
            # Always shut the browser down; otherwise every request would
            # leave a Chrome/chromedriver process running.
            web.quit()
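python 连接 mysql 的 SQL 语句中如果含有中文,可以用 format 拼接:
sql = 'select id from question where user = "{0}" and Q_title = "{1}"'.format(item['Q_user'], item['Q_title'])
一定要这样写,记住 {0}、{1} 外面是有引号的,有引号!注意:字符串拼接存在 SQL 注入风险,更稳妥的做法是使用参数化查询,例如 cursor.execute('select id from question where user = %s and Q_title = %s', (item['Q_user'], item['Q_title'])),此时占位符 %s 外面不需要加引号。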