当前位置: 首页 > news >正文

杭州建设网 工程信息网站wordpress设置主题404模板

杭州建设网 工程信息网站,wordpress设置主题404模板,wordpress playlm版权,百科创建说明 插件爬虫相当于二次爬虫,二次加工信息,因为大部分插件信息也是从正规网上去获取数据,这次列举helium插件爬虫案例,其他插件爬虫也是类似这个方式. 需求 1、用谷歌浏览器，下载chrome extension：“Helium 10” 2、登录helium10 3、打开Amazo…说明 插件爬虫相当于二次爬虫,二次加工信息,因为大部分插件信息也是从正规网上去获取数据,这次列举helium插件爬虫案例,其他插件爬虫也是类似这个方式. 需求 1、用谷歌浏览器，下载chrome extension：“Helium 10” 2、登录helium10 3、打开Amazon首页，搜索women clothes：https://www.amazon.com/s?k=women+clothes&crid=F0IYFXRNCJHD&sprefix=women+clothes%2Caps%2C113&ref=nb_sb_noss_1 点击插件，点击Xray，得到如上图的弹窗。 针对这个表格里的每一行：(1) 记录所有的信息（序号, Product, ASIN, Brand, Price, Sales, Revenue, BSR, Seller Country/Region, Fees, Active Sellers, Ratings, Reviews, Size Tier, Buy Box, Fulfillment, Dimensions, Weight, Creation Date）等信息；(2) 在Sales、BSR和Reviews旁边，如果不是空值的话，会有一个图标，点进图标，选All Time，下载CSV，以每个产品的ASIN命名csv文件。比如 B0CRKQ44NH_sales.csv, B0CRKQ44NH_bsr.csv, B0CRKQ44NH_review.csv。也就是说，针对每个产品，我想要得到一个总表和三个分表（Sales、BSR和Review Count）。遍历所有的women clothes产品，我一共需要10000个产品。 代码 import timefrom platforms.base_platform import ObjectPlatform from util.xpath_operation import SeleniumOperation from util.pd_util import PandasUtil from urllib.parse import urlsplit import ddddocr import os import sys import datetime helium 谷歌插件爬虫 class HeliumExtensionPlatform(ObjectPlatform):name heliumextensionplatformdescribtion config_file %s%s % (name, _config)setting_file %s%s % (name, _setting)def __init__(self, config_file_inputNone, setting_file_inputNone, logNone):super(HeliumExtensionPlatform, self).__init__(load_extension_2True, loginFalse, loggerlog)if not self.driver:print(启动失败...,请根据问题,重新启动)sys.exit(1)self.config_file config_file_input if config_file_input else \%s%s % (self.base_config_package, self.config_file)self.setting_file setting_file_input if setting_file_input else \%s%s % (self.base_setting_package, self.setting_file)self.config_package __import__(self.config_file, fromlistTrue)self.setting_package __import__(self.setting_file, fromlistTrue)self.log logSeleniumOperation.log logself.beans {}def before_run(self):dependencies 
self.config_package.basic_config[dependencies]for depend in dependencies:self.log.info(加载依赖:{}...., depend)platform_position __import__(dependencies[depend][path], fromlistTrue)if hasattr(platform_position, dependencies[depend][class]):dependency getattr(platform_position,dependencies[depend][class]) # http://blog.csdn.net/d_ker/article/details/53671952dependency_obj dependency(self.driver, logself.log)self.beans[depend] dependency_objself.log.info(加载依赖:{}成功, depend)self.ocr ddddocr.DdddOcr()print(启动%s平台 % HeliumExtensionPlatform.describtion)def run(self):url self.config_package.basic_config[main_url]# 打开亚马逊页面print(打开页面:, url)self.get_url_ignore_exception(url)# 解决打开url有验证码的情况self._input_code(url)page 1print(打开url, url)SeleniumOperation.get_url_ignore_exception(self.driver, self.config_package.basic_config[url])while True:print(第%s页数据获取...... % page)datas self.run_helium_extension()if datas:out_file_name os.path.join(self.config_package.basic_config[out_path],str(datetime.date.today()) _ str(page) .csv)PandasUtil.write_csv_append(datas, out_file_name)elements SeleniumOperation.get_elements(self.driver, self.setting_package.NEXT_PAGE_XPATH)if elements:next_page_element elements[-1]text next_page_element.textif 下一页 or Next in text:next_page_element.click()page 1print(有下一页, 进入下一页,开始爬取第%s页...... 
% (page))while True:time.sleep(5)url_str self.driver.current_urlpage_str page str(page)if page_str in url_str:self.config_package.basic_config[url] url_strbreakprint(当前url%s,希望url中有关键词%s%(url_str,page_str))continueelse:print(找不到下一页,结束运行)break输入验证码def _input_code(self, url):while True:input_code_element SeleniumOperation.get_element(self.driver, self.setting_package.INPUT_CODE_XPATH)if input_code_element:code self._ocr_code()if not code:print(刷新页面,识别不出来)self.get_url_ignore_exception(url)continueelse:breakelse:returninput_code_element.clear()input_code_element.send_keys(code)SeleniumOperation.click_button_anyway(self.driver, self.setting_package.SUBMIT_CODE_XPATH)def _ocr_code(self):pic_elements SeleniumOperation.get_elements(self.driver, self.setting_package.IMAGE_CODE_XPATH)if len(pic_elements) 1:pic_element pic_elements[0]image_url pic_element.get_attribute(src)import requests# code_file_name os.path.join(self.config_package.basic_config[out_path],# os.path.splitext(# self.config_package.basic_config[file_name].split(/)[-1])[# 0] - str(datetime.time()) .jpg)# with open(code_file_name, modewb) as f:# f.write(requests.get(image_url).content) # 将图片以二进制写入## with open(code_file_name, rb) as f: # 打开图片# img_bytes f.read() # 读取图片res self.ocr.classification(requests.get(image_url).content) # 识别print(识别验证码是:, res)return resreturn Nonedef run_helium_extension(self):print(开始运行helium插件)try:self.beans[helium].before_run(dataself.config_package.basic_config)datas self.beans[helium].run()return datasexcept Exception as e:print(运行helium插件出错了)self.log.exception(e)return Nonefinally:self.beans[helium].after_run()def after_run(self):print(%s 平台已经运行完成请根据log目录查看运行日志\n % HeliumExtensionPlatform.describtion)super(HeliumExtensionPlatform, self).after_run()if __name__ __main__:url https://www.amazon.fr/dp/B0BNW5P4PC?th1netloc urlsplit(url).netlocsubfix_location netloc.split(.)[-1]print(subfix_location)obj HeliumExtensionPlatform()obj.run()插件运行核心代码 import random import timefrom 
extensions.basic_extension import BasicExtension from util.xpath_operation import SeleniumOperation from settings import ConfigPackage from settings import SettingPackage from settings import DownLoadPath from selenium.webdriver.common.by import By import os import shutil from util.io_util import IOUTILclass Helium10Extension(BasicExtension):name helium10extensionconfig_file %s%s % (name, _config)setting_package %s%s % (name, _setting)def __init__(self, driver, log):super(Helium10Extension, self).__init__()self.config_file %s%s % (ConfigPackage, self.config_file)self.setting_file %s%s % (SettingPackage, self.setting_package)self.config_package __import__(self.config_file, fromlistTrue)self.setting_package __import__(self.setting_file, fromlistTrue)self.log logself.driver driverself.shadow_driver driverSeleniumOperation.log logdef before_run(self, dataNone):# XPathOperation.click_button(self.driver, self.setting_package.HELIUM10_CLICK_XPATH)url data[url]while self.login():passself._get_shadow_dom(url)while True:SeleniumOperation.click_button_anyway(self.shadow_driver, self.setting_package.HELIUM10_CLICK_CSS_PATH,by_typeBy.CSS_SELECTOR)element SeleniumOperation.get_element(self.shadow_driver,self.setting_package.HELIUM10_XRAY_2_CSS_PATH,loadingFalse, by_typeBy.CSS_SELECTOR)if element:SeleniumOperation.click_button(self.shadow_driver,self.setting_package.HELIUM10_XRAY_2_CSS_PATH,by_typeBy.CSS_SELECTOR)breakelse:SeleniumOperation.click_button_anyway(self.shadow_driver, self.setting_package.HELIUM10_CLICK_CSS_PATH,by_typeBy.CSS_SELECTOR)element SeleniumOperation.get_element(self.shadow_driver, self.setting_package.LOGIN_CSS_PATH,loadingFalse, by_typeBy.CSS_SELECTOR)if element:print(需要重新登录....)self.login()self._get_shadow_dom(url)def _get_shadow_dom(self, url):while True:element SeleniumOperation.get_element(self.driver, self.setting_package.AMAZION_XPATH)if element:self.shadow_driver 
SeleniumOperation.get_shadow_root_js(self.driver,self.setting_package.SHADOW_CSS_PATH)if self.shadow_driver:returnprint(刷新页面:, url)element SeleniumOperation.get_element(self.driver, self.setting_package.SORRY_XPATH)if element:print(回到首页)SeleniumOperation.click_button_anyway(self.driver, self.setting_package.SORRY_XPATH)time.sleep(10)SeleniumOperation.get_url_ignore_exception(self.driver, url)# def before_refresh(self):# element SeleniumOperation.get_element(self.driver, self.setting_package.SORRY_XPATH)# if element:# SeleniumOperation.get_url_ignore_exception()def run(self):datas, shadow_driver self.run_page(0)# if not next_page:# print(没有下一页,结束运行.....)# return datasprint(关闭x-ray)click_result Falsewhile not click_result:element SeleniumOperation.get_element(shadow_driver, self.setting_package.ALL_CLOSE_CSS_PATH,by_typeBy.CSS_SELECTOR)if not element:print(x-ray已经关闭了)breakclick_result SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.ALL_CLOSE_CSS_PATH,by_typeBy.CSS_SELECTOR)return datasdef run_page(self, begin_index):datas []count 0while True:shadow_driver SeleniumOperation.get_shadow_root_js(self.driver,self.setting_package.SHADOW_DETAIL_PATH)if shadow_driver:count 1print(X_RAY 树加载出来)elements SeleniumOperation.get_elements(shadow_driver, self.setting_package.TABLE_CSS_PATH,loadingFalse,by_typeBy.CSS_SELECTOR)if elements and len(elements) begin_index:breakprint(等待数据出来.......)element SeleniumOperation.get_element(shadow_driver, self.setting_package.NEW_UI_CSS_PATH,loadingFalse,by_typeBy.CSS_SELECTOR)if element:print(现在是旧的ui,切换新的ui)SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.NEW_UI_CSS_PATH,loadingFalse,by_typeBy.CSS_SELECTOR)if elements and count 20 and begin_index 0:print(重新点击load mores的按钮)SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.LOAD_MORE_CSS_PATH,by_typeBy.CSS_SELECTOR)print(等待X-RAY加载完成)time.sleep(5) # 这个X-RAY隐藏树加载有点慢print(找到%s条数据 % len(elements))elements 
elements[begin_index:]print(只要获取%s条数据 % len(elements))for index, element in enumerate(elements):print(获取第%s条数据 % (index 1))datas.append(self.load_data(element))# # 基于xpath也能找到# child_element SeleniumOperation.get_element(element, self.setting_package.ORDER_XPATH,# loadingTrue)# if child_element:# print(xpath方式找到儿子的元素, child_element.text)print(开始下载csv文件)# 下载文件for index, data in enumerate(datas):print(下载第%s个商品:%s 的csv文件 % (index 1, data[Product]))self.download_all_time_csv(elements[index], data, shadow_driver)# load_more_element SeleniumOperation.get_element(shadow_driver, self.setting_package.LOAD_MORE_CSS_PATH,# by_typeBy.CSS_SELECTOR)# loads_more False# if load_more_element:# print(还有loads more,尝试loads more点击)# loads_more SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.LOAD_MORE_CSS_PATH,# by_typeBy.CSS_SELECTOR)return datas, shadow_driverdef download_all_time_csv(self, element, data, shadow_driver):while SeleniumOperation.click_button_anyway(element, self.setting_package.COLUMNS_CSS_PATH[BSR] self.setting_package.BSR_CLICK_CSS_PATH,loadingTrue,by_typeBy.CSS_SELECTOR) \or SeleniumOperation.click_button_anyway(element,self.setting_package.COLUMNS_CSS_PATH[Reviews] self.setting_package.REVIEWS_CLICK_CSS_PATH,loadingTrue,by_typeBy.CSS_SELECTOR) \or SeleniumOperation.click_button_anyway(element, self.setting_package.COLUMNS_CSS_PATH[Sales] self.setting_package.SALE_CLICK_CSS_PATH,loadingTrue,by_typeBy.CSS_SELECTOR):print(进入sales、bsr 、reviews趋势图页面)element SeleniumOperation.get_element(shadow_driver, self.setting_package.SALE_CLICK_SWITCH_CSS_PATH,loadingTrue,by_typeBy.CSS_SELECTOR)if element:break## else:# print(商品%s三个地方都不可点击无法下载,ASIN号%s % (data[Product], data[Asin]))# returnfile_name data[Asin] if data[Asin] else random.Random(10000)if not self.download_files_csv(shadow_driver, self.setting_package.SALE_CLICK_SWITCH_CSS_PATH):print(下载商品{}的sales csv文件失败.format(data[Product]))self.log.error(下载商品{}的sales 
csv文件失败.format(data[Product]))else:self.wait_loaded_and_rename(file_name _sales.csv, data)print(下载商品%s的bsr csv文件 % (data[Product]))if not self.download_files_csv(shadow_driver, self.setting_package.BSR_CLICK_SWITCH_CSS_PATH):print(下载商品{}的bsr csv文件失败.format(data[Product]))self.log.error(下载商品{}的bsr csv文件失败.format(data[Product]))else:self.wait_loaded_and_rename(file_name _bsr.csv, data)print(下载商品%s的reviews csv文件 % (data[Product]))if not self.download_files_csv(shadow_driver, self.setting_package.REVIEWS_CLICK_SWITCH_CSS_PATH):print(下载商品%s的reviews csv文件 % (data[Product]))self.log.error(下载商品{}的reviews csv文件失败.format(data[Product]))else:self.wait_loaded_and_rename(file_name _reviews.csv, data)click_result Falsewhile not click_result:print(关闭窗口)element SeleniumOperation.get_element(shadow_driver, self.setting_package.CLOSE_CSS_PATH, loadingTrue,by_typeBy.CSS_SELECTOR)if not element:print(进入sales、bsr 、reviews趋势图页面已经关闭)breakclick_result SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.CLOSE_CSS_PATH,loadingTrue, by_typeBy.CSS_SELECTOR)def download_files_csv(self, shadow_driver, css_path):click_result SeleniumOperation.click_button_anyway(shadow_driver, css_path, loadingTrue,by_typeBy.CSS_SELECTOR)if click_result:# 选择 all——time时间while True:times_elements SeleniumOperation.get_elements(shadow_driver, self.setting_package.ALL_TIME_CSS_PATH,loadingFalse,by_typeBy.CSS_SELECTOR)if times_elements and len(times_elements) 1:times_elements[-1].click()breaktime.sleep(5)# 点击下载入口click_result SeleniumOperation.click_button_anyway(shadow_driver,self.setting_package.DOWNLOAD_ENTRY_CSS_PATH,by_typeBy.CSS_SELECTOR)while not click_result:time.sleep(5)click_result SeleniumOperation.click_button_anyway(shadow_driver,self.setting_package.DOWNLOAD_ENTRY_CSS_PATH,by_typeBy.CSS_SELECTOR)# 下载csv文件return SeleniumOperation.click_button_anyway(shadow_driver, self.setting_package.DOWNLOAD_CSV_CSS_PATH,by_typeBy.CSS_SELECTOR)def wait_loaded_and_rename(self, filename, 
data):download_file_name SeleniumOperation.get_downloaded_filename(self.driver, 5) # wait_time 根据实际需要进行调整if download_file_name:try:shutil.move(os.path.join(DownLoadPath, download_file_name),os.path.join(DownLoadPath, filename))print(文件:%s下载完成 % (filename))returnexcept:passlast_file, last_name IOUTIL.get_last_filename(DownLoadPath)if _sales in last_name or _bsr in last_name or _reviews in last_name:print(下载商品{}的sales csv文件失败.format(data[Product]))self.log.error(下载商品{}的sales csv文件失败.format(data[Product]))returnresult IOUTIL.rename(last_file, last_name, filename)if not result:self.log.error(文件:{}修改成新文件名{}出错了请手动修改.format(last_name, filename))def load_data(self, element):element_data {}for key, values in self.setting_package.COLUMNS_CSS_PATH.items():child_element SeleniumOperation.get_element(element, values,loadingTrue,by_typeBy.CSS_SELECTOR)if child_element:text child_element.textelement_data[key] textelse:print(找不到%s的元素,请查询是否path问题,默认设置为空)element_data[key] Nonereturn element_datadef after_run(self, dataNone):passdef _is_login(self):# 判断是否登陆了element SeleniumOperation.get_element(self.shadow_driver, self.setting_package.HELIUM10_CLICK_CSS_PATH,by_typeBy.CSS_SELECTOR)return True if element else Falsedef login(self):try:print(进入尝试自动登录.......)current_url self.driver.current_urlif https://members.helium10.com/dashboard?accountId in current_url:print(已经登录成功)return FalseSeleniumOperation.get_url_ignore_exception(self.driver, self.config_package.basic_config[login_url])element SeleniumOperation.get_element(self.driver, self.setting_package.EMAIL_XPATH)if not element:print(找不到登录输入信息, 可能已经登录成功)return Falseelement.send_keys(self.config_package.basic_config[email])element SeleniumOperation.get_element(self.driver, self.setting_package.PASSWORD_XPATH)element.send_keys(self.config_package.basic_config[password])element SeleniumOperation.get_element(self.driver, self.setting_package.CAPTCHAID_XPATH)if 
element:input(请在页面手动操作登陆验证码,然后输入任意按键继续:)SeleniumOperation.click_button(self.driver, self.setting_package.LOGIN_XPATH)return Falseexcept:self.log.exception(登录失败)return True
http://www.dnsts.com.cn/news/142734.html

相关文章:

  • 注册建筑工程公司需要什么条件seo怎么去做
  • 做网站外包公司学网站开发难吗
  • 网站模板html5高校后勤网站建设
  • 网站的导航栏360推广登陆入口
  • 网站备案号怎么放友汇网站建设一般多少钱
  • php音乐外链网站源码网站建设岗位的认知
  • 东莞seo技术培训seo教程自学入门教材
  • 抚顺网站设计wordpress 内容发布
  • 用asp做的网站运行完之后怎么生成一个可以打开的网站图标怎样向顾客电销网站建设
  • 网站建设推广的软文加国无忧51工作网
  • 网站备案背景幕布尺寸微商怎么开店步骤
  • 东莞网站建设员如何做网站 站长教课
  • 海网站建设生产厂家哪家好新零售分销系统开发
  • 徐州网站建设优化宣传合肥网站建设网页设计
  • 吉林省建设工程质量监督站网站免费下载建网站教程
  • 简单网页模板代码c盘优化大师
  • 做网站的服务商网站开发维护人员
  • 针对网站开发软件 代替手动电商网站建设过程
  • 移动无线宽带怎么续费北京优化公司排行
  • 嵌入式和网站开发python做的网站如何打开
  • 网站域名价格 优帮云网站视频外链怎么做
  • 网页设计与网站建设在线作业答案系部网站开发计划
  • wordpress 游戏主题下载深圳优化公司公认安高粱seo
  • gta5卖公司显示网站正在建设中郑州做网站优化电话
  • 花都网站建设价格做网站的市场有那么大吗
  • 不能上传图片到网站希音跨境电商官网
  • 网站图片上传不上去怎么办黄岛区做网站多少钱
  • 干网站建设销售怎么样河北石家庄网络公司
  • 企业网站建设报价明细表百度seo关键词优化排名
  • 网站哪些页面会做静态化西安软件公司排名