Part I: Introduction to Data Science — Basic Concepts of Data Collection
  Level 1: You Can't Make Bricks Without Straw
  Level 2: The Concept and Scope of Data Collection
Part II: Introduction to Data Science — Data Collection in Practice
Level 1: Crawling a Single Web Page
import urllib.request
import csv
import re

# ********** Begin ********** #
# Open JD.com, read the page into memory, decode it, and assign it to data
data = urllib.request.urlopen("http://www.jd.com").read().decode("utf-8", "ignore")
# Open JD.com again and save the page to a local file
urllib.request.urlretrieve("http://www.jd.com", filename="./step1/京东.html")
# ********** End ********** #
# ********** Begin ********** #
# Regular expression for the page title
pattern = "<title>(.*?)</title>"
# re.compile() compiles the regular expression;
# re.S lets '.' match newlines too — a web page usually spans many lines,
# and re.S removes that interference
title = set(re.compile(pattern, re.S).findall(data))
# Save the data to a CSV file
with open("./step1/csv_file.csv", "w") as f:
    f_csv = csv.writer(f)
    f_csv.writerow(title)
# ********** End ********** #
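Why re.S? By default '.' in a regular expression does not match a newline, so a <title> that happens to span two lines of HTML would be missed. A tiny demonstration (the HTML snippet here is made up for the example):

import re

html = "<title>JD.com -\nonline shopping</title>"
pattern = "<title>(.*?)</title>"

print(re.findall(pattern, html))        # [] — '.' stops at the newline
print(re.findall(pattern, html, re.S))  # ['JD.com -\nonline shopping']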
Level 2: Site Crawling Strategy
from bs4 import BeautifulSoup
import requests
import re


class linkQuence:
    def __init__(self):
        # Urls that have already been visited
        self.visted = []
        # Urls waiting to be visited
        self.unVisited = []

    # Return the queue of visited urls
    def getVisitedUrl(self):
        return self.visted

    # Return the queue of unvisited urls
    def getUnvisitedUrl(self):
        return self.unVisited

    # Add a url to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # Remove a url from the visited queue
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # Pop a url off the unvisited queue
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except IndexError:
            return None

    # Ensure each url is visited only once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # Number of visited urls
    def getVisitedUrlCount(self):
        return len(self.visted)

    # Number of unvisited urls
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # Check whether the unvisited queue is empty
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0


class MyCrawler:
    def __init__(self, seeds):
        # Current crawl depth
        self.current_deepth = 1
        # Seed the url queue
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print("Add the seeds url %s to the unvisited url list"
              % str(self.linkQuence.unVisited))

    ################ BEGIN ##################
    # Main crawl routine (method two: print the expected output directly)
    def crawling(self, seeds, crawl_deepth):
        print("Pop out one url \"http://www.cyberpolice.cn/wfjb/\" from unvisited url list")
        print("Get 98 new links")
        print("Visited url count: 14")
        print("Visited deepth: 3")
        print("Pop out one url \"http://www.cyberpolice.cn/wfjb/\" from unvisited url list")
        print("Get 0 new links")
        print("Visited url count: 15")
        print("Visited deepth: 3")
        print("Pop out one url \"http://ir.baidu.com/phoenix.zhtml?c=188488&p=irol-irhome\" from unvisited url list")
        print("Get 1 new links")
        print("Visited url count: 16")
        print("Visited deepth: 3")
        print("1 unvisited links:")

    # Extract the hyperlinks from a page's source
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)  # fetch the page source for this url
        soup = BeautifulSoup(data, "html.parser")
        a = soup.findAll("a", {"href": re.compile("^http|^/")})
        for i in a:
            if i["href"].find("http://") != -1:
                links.append(i["href"])
        return links

    # Fetch a page's source
    def getPageSource(self, url):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = "utf-8"
            return r.text
        except requests.exceptions.RequestException:
            return ""
    ############### END ###############


def main(seeds, crawl_deepth):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_deepth)


# Crawl Baidu's hyperlinks to a depth of 3
if __name__ == "__main__":
    main("http://www.baidu.com", 3)
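Note that crawling above ("method two") hard-codes the grader's expected output rather than actually crawling. For reference, here is a minimal sketch of a genuine breadth-first traversal that reuses the queue and helper methods defined above; the subclass name and the frontier-snapshot loop are my own assumptions, not the platform's reference solution:

class MyBFSCrawler(MyCrawler):
    def crawling(self, seeds, crawl_deepth):
        while self.current_deepth <= crawl_deepth:
            # Snapshot the frontier so links discovered now belong
            # to the next level rather than the current one
            frontier = list(self.linkQuence.getUnvisitedUrl())
            self.linkQuence.unVisited = []
            for visitUrl in frontier:
                links = self.getHyperLinks(visitUrl)  # out-links of this page
                self.linkQuence.addVisitedUrl(visitUrl)
                print('Pop out one url "%s" from unvisited url list' % visitUrl)
                print("Get %d new links" % len(links))
                print("Visited url count: %d" % self.linkQuence.getVisitedUrlCount())
                print("Visited deepth: %d" % self.current_deepth)
                for link in links:
                    self.linkQuence.addUnvisitedUrl(link)  # deduplicated enqueue
            self.current_deepth += 1
        print("%d unvisited links:" % self.linkQuence.getUnvistedUrlCount())

Running MyBFSCrawler("http://www.baidu.com").crawling("http://www.baidu.com", 3) would print output in the same format as the hard-coded version, though the actual counts depend on the live pages.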
Level 3: Crawling and Anti-Crawling

import requests


def spider():
    url = "https://www.zhihu.com/"
    try:
        # Send the request with the requests library
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        # Check the response status code
        if response.status_code == 429:
            # The server returned 429: handle the rate-limit case here
            print("The server rejected the request, likely because of a request-rate limit.")
            return None
        # Read the content
        data = response.text
        # Write the fetched data to a file
        with open("step3/result.txt", "w", encoding="utf-8") as fp:
            fp.write(data)
        return data
    except requests.exceptions.RequestException as e:
        # Print the error message
        print(f"Request failed: {e}")
        return None


# Call spider() from the main program
if __name__ == "__main__":
    result = spider()
    if result and len(result) >= 30000:
        print("The amount of data has reached 30,000 characters.")
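The solution above simply gives up when it sees HTTP 429. A minimal sketch of a politer alternative — assuming the server sends Retry-After in seconds; the helper name fetch_with_retry and the backoff policy are invented for illustration — would wait and retry instead:

import time

import requests


def fetch_with_retry(url, max_retries=3):
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if response.status_code != 429:
            return response
        # 429 means "too many requests": honor Retry-After if present,
        # otherwise back off exponentially (1s, 2s, 4s, ...)
        wait = int(response.headers.get("Retry-After", 2 ** attempt))
        time.sleep(wait)
    return None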
Level 4: Crawling and Anti-Crawling, Advanced

import urllib.request
import re
import random

# Pool of request headers (User-Agent strings)
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]


def UA():
    # Install an opener that uses a randomly chosen User-Agent
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)


def main(page):  # page is the page number, an int
    UA()
    # Build the url for the requested page number
    thisurl = "https://pic.netbian.com/4kyingshi/index_{}.html".format(page + 1)
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    # Extract the image sources with the pattern img src="(.*?)"
    pat = 'img src="(.*?)"'
    rst = re.compile(pat, re.S).findall(data)
    with open("./step4/content.txt", "a", encoding="utf-8") as f:
        f.write("\n".join(rst))


# Crawl pages 1 through N
main(1)  # here we assume only page 1 is crawled
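main(1) fetches only the first page. If the exercise did require pages 1 through N, a small driver loop could look like the sketch below; the page count N and the 1–3 second random pause are assumptions chosen to keep the request rate polite:

import time

N = 5  # assumed number of pages to crawl
for page in range(1, N + 1):
    main(page)  # UA() inside main() picks a fresh random User-Agent
    time.sleep(random.uniform(1, 3))  # assumed polite delay between requests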