用CMS做网站的好处,软文营销范文100字,如何网站防止采集,聚美优品返利网站怎么做项目介绍
【开源】项目基于 python + pandas + flask + mysql 等技术实现豆瓣电影数据获取及可视化分析展示，觉得有用的朋友可以来个一键三连，感谢！！！
项目演示 【开源】2024最新python豆瓣电影数据爬虫可视化分析项目 项目截图…项目介绍
【开源】项目基于 python + pandas + flask + mysql 等技术实现豆瓣电影数据获取及可视化分析展示，觉得有用的朋友可以来个一键三连，感谢！
项目演示 【开源】2024最新python豆瓣电影数据爬虫可视化分析项目 项目截图
首页 列表页 爬虫演示
项目地址
https://github.com/mudfish/python-douban-view
项目结构 核心模块
电影爬虫 异步并发爬虫
# Maximum number of list pages fetched per movie type in one run.
MAX_PAGES = 5
# JSON file that records, per movie type, the next page to fetch.
PAGE_PROGRESS_FILE = "page_progress.json"
# Movie types (Douban tags) to crawl.
MOVIE_TYPES = ["剧情", "喜剧", "动作", "爱情", "科幻", "动画"]
# Name of the CSV file the crawler writes to.
CSV_NAME = "movie_data.csv"
# CSV header row; the order matches the row built in Spider.process_movie.
CSV_HEADS = [
    "id",
    "movie_id",
    "title",
    "year",
    "directors",
    "casts",
    "rating",
    "cover",
    "country",
    "summary",
    "types",
    "lang",
    "release_date",
    "time",
    "url",
]
# Release-date cleanup pattern: matches every character that is not a digit or "-".
RELEASE_DATE_REMOVE_RE = r"[^0-9-]"

# NOTE(review): the article does not show this script's import section. The
# code below assumes these names are already in scope -- confirm against the
# repository: asyncio, csv, json, os, random, re, time, traceback, aiohttp,
# pandas as pd, etree (presumably lxml.etree), tqdm (presumably tqdm.tqdm),
# and create_engine / text (presumably from sqlalchemy).

# NOTE(review): database credentials are hard-coded in the connection URL.
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1:3306/db_douban")


def get_id():
    """Return a numeric string used as the row id.

    The value is a random integer in [1, 100000000] followed by the
    fractional digits of the current timestamp.
    """
    return str(random.randint(1, 100000000)) + str(time.time()).split(".")[1].strip()


class Spider:
    """Asynchronous Douban movie crawler.

    For each type in MOVIE_TYPES it walks the recommend API page by page,
    fetches every movie's detail page, appends one row per movie to CSV_NAME,
    and finally loads the de-duplicated CSV into the ``tb_movie`` table.
    """

    def __init__(self):
        self.movie_page_url = "https://m.douban.com/rexxar/api/v2/movie/recommend?"
        self.movie_detail_url = "https://movie.douban.com/subject/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Referer": "https://movie.douban.com/explore",
        }
        self.movie_types = MOVIE_TYPES
        # Maps movie type -> next page number to fetch.
        self.page_progress = {}
        # Number of pages that still have to be fetched in this run.
        self.total_pages = 0
        self.completed_pages = 0
        self.global_progress_bar = None

    def init(self):
        """Recreate the CSV file so that it contains only the header row."""
        # Drop the CSV left over from a previous run before starting.
        if os.path.exists(CSV_NAME):
            os.remove(CSV_NAME)
        with open(CSV_NAME, "w", newline="", encoding="utf-8") as writer_f:
            writer = csv.writer(writer_f)
            writer.writerow(CSV_HEADS)

    def load_page_progress(self):
        """Load the per-type page progress from PAGE_PROGRESS_FILE if it exists."""
        if os.path.exists(PAGE_PROGRESS_FILE):
            with open(PAGE_PROGRESS_FILE, "r", encoding="utf-8") as f:
                # An empty file is not valid JSON: reset the progress instead.
                if os.stat(PAGE_PROGRESS_FILE).st_size == 0:
                    print("初始化页面进度")
                    self.page_progress = {}
                    self.save_page_progress()
                else:
                    self.page_progress = json.load(f)

    def save_page_progress(self):
        """Write the current page progress to PAGE_PROGRESS_FILE as JSON."""
        with open(PAGE_PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(self.page_progress, f, ensure_ascii=False)

    async def get_movie_pages(self, session, type_name):
        """Fetch the remaining list pages of one movie type and process each movie.

        Args:
            session: the HTTP client session used for all requests.
            type_name: the movie type (tag) to crawl.

        A page that fails is reported and skipped; its progress entry is not
        advanced.
        """
        start_page = self.page_progress.get(type_name, 1)
        if start_page <= MAX_PAGES:
            for page in range(start_page, MAX_PAGES + 1):
                # NOTE(review): "start" advances by 20 per page while "count"
                # requests 10 items, so items 10-19 of each step are never
                # requested -- confirm whether this is intended.
                params = {"start": (page - 1) * 20, "count": 10, "tags": type_name}
                try:
                    async with session.get(
                        self.movie_page_url, headers=self.headers, params=params
                    ) as resp:
                        resp.raise_for_status()
                        resp_json = await resp.json()
                        movie_list = resp_json["items"]
                        for m in movie_list:
                            if m["type"] == "movie":
                                await self.process_movie(session, m)
                        # Record the next page to fetch for this type.
                        self.page_progress[type_name] = page + 1
                        self.save_page_progress()
                        # Refresh the global progress bar.
                        self.update_global_progress()
                except Exception as e:
                    print(f"处理:{type_name}第{page}页失败: {e}")
                    traceback.print_exc()
                    continue

    async def process_movie(self, session, movie):
        """Fetch one movie's detail page and append its row to the CSV file.

        Args:
            session: the HTTP client session used for the detail request.
            movie: one item of the recommend API response; its "id", "title"
                and "year" entries are read.

        Raises:
            IndexError: when a required element (rating, cover, country,
                summary, language or release date) is missing from the page.
        """
        movie_data = []
        movie_data.append(get_id())
        movie_data.append(movie["id"])
        movie_data.append(movie["title"])
        movie_data.append(movie["year"])
        async with session.get(
            self.movie_detail_url.format(movie["id"]), headers=self.headers
        ) as resp:
            resp.raise_for_status()
            html_text = await resp.text()
        path = etree.HTML(html_text)
        # Directors
        movie_data.append(",".join(path.xpath('//a[@rel="v:directedBy"]/text()')))
        # Cast
        movie_data.append(",".join(path.xpath('//a[@rel="v:starring"]/text()')))
        # Rating
        movie_data.append(path.xpath('//strong[@property="v:average"]/text()')[0])
        # Cover image URL
        movie_data.append(path.xpath('//img[@rel="v:image"]/@src')[0])
        # Country: the text node just before the <br> that follows the label.
        movie_data.append(
            path.xpath(
                '//span[contains(text(),"制片国家")]/following-sibling::br[1]/preceding-sibling::text()[1]'
            )[0].replace(" / ", ",")
        )
        # Summary
        movie_data.append(path.xpath('//span[@property="v:summary"]/text()')[0].strip())
        # Genres
        movie_data.append(
            ",".join(path.xpath('//div[@id="info"]/span[@property="v:genre"]/text()'))
        )
        # Language
        movie_data.append(
            path.xpath(
                '//span[contains(text(),"语言")]/following-sibling::br[1]/preceding-sibling::text()[1]'
            )[0]
        )
        # Release date: first 10 characters with everything but digits and "-" removed.
        movie_data.append(
            re.sub(
                RELEASE_DATE_REMOVE_RE,
                "",
                path.xpath('//span[@property="v:initialReleaseDate"]/text()')[0][:10],
            )
        )
        # Runtime is optional: store an empty string when it is absent.
        movie_time = path.xpath('//span[@property="v:runtime"]/text()')
        if len(movie_time) > 0:
            movie_data.append(movie_time[0])
        else:
            movie_data.append("")
        # Detail page URL
        movie_data.append(self.movie_detail_url.format(movie["id"]))
        self.save_to_csv(movie_data)

    def save_to_csv(self, row):
        """Append one row to the CSV file."""
        with open(CSV_NAME, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(row)

    def clean_csv(self):
        """De-duplicate the CSV by movie_id, append it to tb_movie, then remove
        duplicate movie_id rows from the table (the row with the smallest id
        is kept)."""
        print("清理数据")
        df = pd.read_csv(CSV_NAME, encoding="utf-8")
        df.drop_duplicates(subset=["movie_id"], keep="first", inplace=True)
        print("存储到数据库...")
        df.to_sql("tb_movie", con=engine, index=False, if_exists="append")
        print("清理重复数据...")
        # engine.begin() commits on success and always releases the connection;
        # a bare engine.connect().execute(...) leaks the connection and is not
        # committed under SQLAlchemy 2.x.
        with engine.begin() as conn:
            conn.execute(
                text(
                    "delete t1 from tb_movie t1 inner join (select min(id) as id,movie_id from tb_movie group by movie_id having count(*) > 1) t2 on t1.movie_id=t2.movie_id where t1.id>t2.id"
                )
            )

    def update_global_progress(self):
        """Count one more completed page and advance the progress bar."""
        self.completed_pages += 1
        self.global_progress_bar.update(1)
        self.global_progress_bar.refresh()

    async def run(self):
        """Run the whole crawl: reset the CSV, load progress, fetch every
        movie type concurrently, then store the results in the database."""
        self.init()
        self.load_page_progress()
        # Count the pages still to fetch across all movie types.
        for type_name in MOVIE_TYPES:
            if MAX_PAGES >= self.page_progress.get(type_name, 1):
                self.total_pages += MAX_PAGES + 1 - self.page_progress.get(type_name, 1)
        print(self.total_pages)
        if self.total_pages > 0:
            self.global_progress_bar = tqdm(
                total=self.total_pages, desc="progress", unit="page", colour="GREEN"
            )
            async with aiohttp.ClientSession() as session:
                tasks = [
                    self.get_movie_pages(session, type_name)
                    for type_name in self.movie_types
                ]
                await asyncio.gather(*tasks)
            # The page progress is deliberately kept after the run so that a
            # later run resumes where this one stopped.
            self.global_progress_bar.close()
            self.clean_csv()


if __name__ == "__main__":
    spider = Spider()
    # asyncio.run creates and closes the event loop; calling
    # asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(spider.run())
电影可视化
接口代码
from flask import Flask, render_template, request, redirect, url_for, session
from utils import db_query

app = Flask(__name__)
# NOTE(review): the session signing key is hard-coded; load it from
# configuration or the environment before deploying.
app.secret_key = "mysessionkey"

# Global request interceptor
@app.before_request
def before_request():
    """Redirect requests without a logged-in session to the login page.

    Requests for /static..., /login, /logout and /register pass through
    unchecked. Returns None to let the request proceed, or a redirect
    response to the login view.
    """
    if request.path.startswith("/static") or request.path in (
        "/login",
        "/logout",
        "/register",
    ):
        return None
    # Not logged in: send the user to the login page.
    if not session.get("login_username"):
        return redirect(url_for("login"))
    return None


# Home page
@app.route("/")
def index():
    """Render the dashboard with the overall, per-type and per-rating statistics."""
    # Overall movie statistics
    movie_stats = db_query.fetch_movie_statistics()
    # Distribution of movies by type
    movie_type_distribution = db_query.fetch_movie_type_distribution()
    # Distribution of movies by rating
    movie_rating_distribution = db_query.fetch_movie_rating_distribution()
    print(movie_rating_distribution)
    return render_template(
        "index.html",
        login_username=session.get("login_username"),
        movie_stats=movie_stats,
        movie_type_distribution=movie_type_distribution,
        movie_rating_distribution=movie_rating_distribution,
    )


# Login
@app.route("/login", methods=["GET", "POST"])
def login():
    """Show the login form (GET) or check the submitted credentials (POST).

    On success the user name is stored in the session and the user is
    redirected to the home page; otherwise the error page is rendered.
    """
    if request.method == "POST":
        # .get() turns a missing form field into a failed login instead of an
        # unhandled KeyError (HTTP 500).
        username = request.form.get("username", "")
        password = request.form.get("password", "")
        # NOTE(review): the password is compared in plain text against the
        # stored value; consider storing salted hashes instead.
        sql = "SELECT * FROM tb_user WHERE username = %s AND password = %s"
        params = (username, password)
        if len(db_query.query(sql, params)) > 0:
            # Remember the logged-in user in the session.
            session["login_username"] = username
            return redirect(url_for("index"))
        return render_template(
            "error.html",
            error="用户名或密码错误",
        )
    elif request.method == "GET":
        return render_template("login.html")


# Logout
@app.route("/logout")
def logout():
    """Remove the logged-in user from the session and redirect to the home page."""
    session.pop("login_username", None)
    return redirect(url_for("index"))


# Registration
@app.route("/register", methods=["GET", "POST"])
def register():
    """Show the registration form (GET) or create a new user (POST).

    The POST is rejected with the error page when a field is empty, the two
    passwords differ, or the user name already exists; on success the user is
    redirected to the login page.
    """
    if request.method == "POST":
        # .get() avoids an unhandled KeyError (HTTP 500) on a missing field.
        username = request.form.get("username", "")
        password = request.form.get("password", "")
        password_confirm = request.form.get("password_confirm", "")
        if not username or not password:
            return render_template(
                "error.html",
                error="用户名和密码不能为空",
            )
        if password == password_confirm:
            # Reject the request when the user name is already taken.
            sql = "SELECT * FROM tb_user WHERE username = %s"
            params = (username,)
            result = db_query.query(sql, params)
            if len(result) > 0:
                return render_template(
                    "error.html",
                    error="用户名已存在",
                )
            # NOTE(review): the password is stored in plain text.
            sql = "INSERT INTO tb_user (username, password) VALUES (%s, %s)"
            params = (
                username,
                password,
            )
            db_query.query(sql, params, db_query.QueryType.NO_SELECT)
            return redirect(url_for("login"))
        else:
            return render_template(
                "error.html",
                error="两次密码输入不一致",
            )
    elif request.method == "GET":
        return render_template("register.html")


@app.route("/list")
def movie_list():
    """Render the movie list page."""
    # Per the original comment, fetch_movie_list is assumed to return a list
    # of movie records.
    movies = db_query.fetch_movie_list()
    return render_template(
        "list.html", login_username=session.get("login_username"), movies=movies
    )


@app.errorhandler(404)
def page_not_found(error):
    """Render the custom 404 page."""
    return render_template("404.html"), 404


@app.errorhandler(500)
def system_error(error):
    """Render the custom 500 page."""
    return render_template("500.html"), 500


if __name__ == "__main__":
    # Reload templates automatically when they change.
    app.jinja_env.auto_reload = True
    app.run(host="127.0.0.1", port=8002, debug=True)

# Article section heading: 首页 (the index.html template follows)
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, shrink-to-fit=no"
    />
    <meta name="description" content="" />
    <meta name="author" content="" />
    <title>首页</title>
    <!-- Custom fonts for this template-->
    <link
      href="/static/vendor/fontawesome-free/css/all.min.css"
      rel="stylesheet"
      type="text/css"
    />
    <link
      href="https://fonts.googleapis.com/css?family=Nunito:200,200i,300,300i,400,400i,600,600i,700,700i,800,800i,900,900i"
      rel="stylesheet"
    />
    <!-- Custom styles for this template-->
    <link href="/static/css/sb-admin-2.min.css" rel="stylesheet" />
  </head>
  <body id="page-top">
    <!-- Page Wrapper -->
    <div id="wrapper">
      <!-- Sidebar -->
      <ul
        class="navbar-nav bg-gradient-primary sidebar sidebar-dark accordion"
        id="accordionSidebar"
      >
        <!-- Sidebar - Brand -->
        <!-- Links to "/" because the Flask app serves the dashboard there; it has no "index.html" route. -->
        <a
          class="sidebar-brand d-flex align-items-center justify-content-center"
          href="/"
        >
          <div class="sidebar-brand-icon rotate-n-15">
            <i class="fas fa-laugh-wink"></i>
          </div>
          <div class="sidebar-brand-text mx-3">豆瓣电影可视化</div>
        </a>
        <!-- Divider -->
        <hr class="sidebar-divider my-0" />
        <!-- Nav Item - Dashboard -->
        <li class="nav-item active">
          <a class="nav-link" href="/">
            <i class="fas fa-fw fa-tachometer-alt"></i>
            <span>首页</span></a
          >
        </li>
        <!-- Movie list -->
        <li class="nav-item">
          <a class="nav-link" href="/list">
            <i class="fas fa-fw fa-table"></i>
            <span>电影列表</span></a
          >
        </li>
        <!-- Divider -->
        <hr class="sidebar-divider d-none d-md-block" />
        <!-- Sidebar Toggler (Sidebar) -->
        <div class="text-center d-none d-md-inline">
          <button class="rounded-circle border-0" id="sidebarToggle"></button>
        </div>
      </ul>
      <!-- End of Sidebar -->
      <!-- Content Wrapper -->
      <div id="content-wrapper" class="d-flex flex-column">
        <!-- Main Content -->
        <div id="content">
          <!-- Topbar -->
          <nav
            class="navbar navbar-expand navbar-light bg-white topbar mb-4 static-top shadow"
          >
            <!-- Sidebar Toggle (Topbar) -->
            <button
              id="sidebarToggleTop"
              class="btn btn-link d-md-none rounded-circle mr-3"
            >
              <i class="fa fa-bars"></i>
            </button>
            <!-- Topbar Search -->
            <!-- <form class="d-none d-sm-inline-block form-inline mr-auto ml-md-3 my-2 my-md-0 mw-100 navbar-search">
              <div class="input-group">
                <input type="text" class="form-control bg-light border-0 small" placeholder="Search for..."
                  aria-label="Search" aria-describedby="basic-addon2">
                <div class="input-group-append">
                  <button class="btn btn-primary" type="button">
                    <i class="fas fa-search fa-sm"></i>
                  </button>
                </div>
              </div>
            </form> -->
            <!-- Topbar Navbar -->
            <ul class="navbar-nav ml-auto">
              <div class="topbar-divider d-none d-sm-block"></div>
              <!-- Nav Item - User Information -->
              <li class="nav-item dropdown no-arrow">
                <a
                  class="nav-link dropdown-toggle"
                  href="#"
                  id="userDropdown"
                  role="button"
                  data-toggle="dropdown"
                  aria-haspopup="true"
                  aria-expanded="false"
                >
                  <span class="mr-2 d-none d-lg-inline text-gray-600 small"
                    >{{login_username}}</span
                  >
                  <img
                    class="img-profile rounded-circle"
                    src="/static/img/avatar.png"
                  />
                </a>
                <!-- Dropdown - User Information -->
                <div
                  class="dropdown-menu dropdown-menu-right shadow animated--grow-in"
                  aria-labelledby="userDropdown"
                >
                  <a
                    class="dropdown-item"
                    href="#"
                    data-toggle="modal"
                    data-target="#logoutModal"
                  >
                    <i
                      class="fas fa-sign-out-alt fa-sm fa-fw mr-2 text-gray-400"
                    ></i>
                    Logout
                  </a>
                </div>
              </li>
            </ul>
          </nav>
          <!-- End of Topbar -->
          <!-- Begin Page Content -->
          <div class="container-fluid">
            <!-- Page Heading -->
            <!-- <div class="d-sm-flex align-items-center justify-content-between mb-4">
              <h1 class="h3 mb-0 text-gray-800">Dashboard</h1>
              <a href="#" class="d-none d-sm-inline-block btn btn-sm btn-primary shadow-sm"><i
                  class="fas fa-download fa-sm text-white-50"></i> Generate Report</a>
            </div> -->
            <!-- Content Row -->
            <div class="row">
              <!-- Card: total number of movies -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-primary shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-primary text-uppercase mb-1"
                        >
                          电影总数
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats["total_movies"] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-calendar fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
              <!-- Card: highest rating -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-success shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-success text-uppercase mb-1"
                        >
                          电影最高评分
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats["highest_rating"] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-dollar-sign fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
              <!-- Card: most frequent cast member -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-info shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-info text-uppercase mb-1"
                        >
                          出演最多演员
                        </div>
                        <div class="row no-gutters align-items-center">
                          <div class="col-auto">
                            <div
                              class="h5 mb-0 mr-3 font-weight-bold text-gray-800"
                            >
                              {{ movie_stats["most_popular_cast"] }}
                            </div>
                          </div>
                          <div class="col">
                            <div class="progress progress-sm mr-2">
                              <div
                                class="progress-bar bg-info"
                                role="progressbar"
                                style="width: 50%"
                                aria-valuenow="50"
                                aria-valuemin="0"
                                aria-valuemax="100"
                              ></div>
                            </div>
                          </div>
                        </div>
                      </div>
                      <div class="col-auto">
                        <i
                          class="fas fa-clipboard-list fa-2x text-gray-300"
                        ></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
              <!-- Card: most common production country -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-warning shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-warning text-uppercase mb-1"
                        >
                          制片最多国家
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats["most_common_country"] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-comments fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
            </div>
            <!-- Content Row -->
            <div class="row">
              <!-- Pie chart: movies by type -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <!-- Card Header - Dropdown -->
                  <div
                    class="card-header py-3 d-flex flex-row align-items-center justify-content-between"
                  >
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影分类统计
                    </h6>
                  </div>
                  <!-- Card Body -->
                  <div class="card-body">
                    <div
                      id="movie_type_chart"
                      style="width: 100%; height: 450px"
                    ></div>
                    <!-- <div class="chart-area"></div> -->
                  </div>
                </div>
              </div>
              <!-- Line chart: movies by rating -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <!-- Card Header - Dropdown -->
                  <div
                    class="card-header py-3 d-flex flex-row align-items-center justify-content-between"
                  >
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影评分统计
                    </h6>
                    <div class="dropdown no-arrow">
                      <a
                        class="dropdown-toggle"
                        href="#"
                        role="button"
                        id="dropdownMenuLink"
                        data-toggle="dropdown"
                        aria-haspopup="true"
                        aria-expanded="false"
                      >
                        <i
                          class="fas fa-ellipsis-v fa-sm fa-fw text-gray-400"
                        ></i>
                      </a>
                      <div
                        class="dropdown-menu dropdown-menu-right shadow animated--fade-in"
                        aria-labelledby="dropdownMenuLink"
                      >
                        <div class="dropdown-header">Dropdown Header:</div>
                        <a class="dropdown-item" href="#">Action</a>
                        <a class="dropdown-item" href="#">Another action</a>
                        <div class="dropdown-divider"></div>
                        <a class="dropdown-item" href="#"
                          >Something else here</a
                        >
                      </div>
                    </div>
                  </div>
                  <!-- Card Body -->
                  <div class="card-body">
                    <div
                      id="movie_score_chart"
                      style="width: 100%; height: 450px"
                    ></div>
                  </div>
                </div>
              </div>
            </div>
            <!-- Content Row -->
          </div>
          <!-- /.container-fluid -->
        </div>
        <!-- End of Main Content -->
        <!-- Footer -->
        <footer class="sticky-footer bg-white">
          <div class="container my-auto">
            <div class="copyright text-center my-auto">
              <span
                >Laoxu Open Source.
                <a target="_blank" href="https://github.com/mudfish"
                  >Github</a
                ></span
              >
            </div>
          </div>
        </footer>
        <!-- End of Footer -->
      </div>
      <!-- End of Content Wrapper -->
    </div>
    <!-- End of Page Wrapper -->
    <!-- Scroll to Top Button-->
    <a class="scroll-to-top rounded" href="#page-top">
      <i class="fas fa-angle-up"></i>
    </a>
    <!-- Logout Modal-->
    <div
      class="modal fade"
      id="logoutModal"
      tabindex="-1"
      role="dialog"
      aria-labelledby="exampleModalLabel"
      aria-hidden="true"
    >
      <div class="modal-dialog" role="document">
        <div class="modal-content">
          <div class="modal-header">
            <h5 class="modal-title" id="exampleModalLabel">Ready to Leave?</h5>
            <button
              class="close"
              type="button"
              data-dismiss="modal"
              aria-label="Close"
            >
              <span aria-hidden="true">×</span>
            </button>
          </div>
          <!-- <div class="modal-body">Select "Logout" below if you are ready to end your current session.</div> -->
          <div class="modal-footer">
            <button
              class="btn btn-secondary"
              type="button"
              data-dismiss="modal"
            >
              Cancel
            </button>
            <a class="btn btn-primary" href="/logout">Logout</a>
          </div>
        </div>
      </div>
    </div>
    <!-- Bootstrap core JavaScript-->
    <script src="/static/vendor/jquery/jquery.min.js"></script>
    <script src="/static/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
    <!-- Core plugin JavaScript-->
    <script src="/static/vendor/jquery-easing/jquery.easing.min.js"></script>
    <!-- Custom scripts for all pages-->
    <script src="/static/js/sb-admin-2.min.js"></script>
    <!-- Page level plugins -->
    <script src="/static/vendor/chart.js/Chart.min.js"></script>
    <!-- Page level custom scripts -->
    <script src="/static/js/demo/chart-area-demo.js"></script>
    <script src="/static/js/demo/chart-pie-demo.js"></script>
    <script src="/static/js/echarts.min.js"></script>
    <script>
      // Pie chart: number of movies per type, data injected by the Flask view.
      var chartDom = document.getElementById("movie_type_chart");
      var myChart = echarts.init(chartDom);
      var option;
      var movieTypeData = {{ movie_type_distribution|tojson }};
      // console.log(movieTypeData)
      option = {
        title: {
          text: "",
          subtext: "来源豆瓣数据",
          left: "center",
        },
        tooltip: {
          trigger: "item",
        },
        legend: {
          orient: "vertical",
          left: "left",
        },
        series: [
          {
            name: "Access From",
            type: "pie",
            radius: "50%",
            data: movieTypeData,
            emphasis: {
              itemStyle: {
                shadowBlur: 10,
                shadowOffsetX: 0,
                shadowColor: "rgba(0, 0, 0, 0.5)",
              },
            },
          },
        ],
      };
      option && myChart.setOption(option);
    </script>
    <script>
      // Area line chart: number of movies per rating, data injected by the Flask view.
      var chartDom = document.getElementById("movie_score_chart");
      var myChart = echarts.init(chartDom);
      var option;
      var ratingData = {{ movie_rating_distribution|tojson }};
      console.log(ratingData)
      option = {
        title: {
          text: "",
          subtext: "来源豆瓣数据",
          left: "center",
        },
        xAxis: {
          type: "category",
          boundaryGap: false,
          // First element of each entry is used as the category label.
          data: ratingData.map(item => item[0]),
        },
        yAxis: {
          type: "value",
        },
        series: [
          {
            // Second element of each entry is used as the value.
            data: ratingData.map(item => item[1]),
            type: "line",
            areaStyle: {},
          },
        ],
        tooltip: {
          trigger: "axis", // axis trigger: used for charts with a category axis such as bar and line charts
          axisPointer: {
            // axis pointer, only effective with the axis trigger
            type: "shadow", // defaults to a line; options: 'line' | 'shadow'
          },
        },
      };
      option && myChart.setOption(option);
    </script>
  </body>
</html>