python正则表达式爬取猫眼电影top100

2024-11-30 18:12:28

用正则表达式爬取猫眼电影top100，具体内容如下

#!/usr/bin/python
# -*- coding: utf-8 -*- 

import json  # 快速导入此模块：鼠标先点到要导入的函数处，再Alt + Enter进行选择
from multiprocessing.pool import Pool #引入进程池 

import requests
import re
import csv
from requests.exceptions import RequestException #引入异常 

## 正确保存，无丢失 

# 请求一个页面返回响应内容
#以《霸王别姬》为例，右击—查看元素—会显示一个网页信息
def get_one_page(url, offset=None, timeout=10):
    """Request one page and return its HTML text.

    Args:
        url: Address of the page to request.
        offset: Value sent as the ``offset`` query parameter. When ``None``
            (the default) no query parameter is added, so a caller that has
            already built the complete URL can pass the URL alone.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    params = None if offset is None else {"offset": offset}
    try:
        # Without a timeout a stalled server would block the crawler forever.
        response = requests.get(url=url, params=params, timeout=timeout)
    except RequestException:
        # Connection errors and timeouts are reported to the caller as "no page".
        return None
    if response.status_code == 200:
        return response.text
    return None

# 解析一个页面
def parse_one_page(html):
    """Yield one dict per movie entry found in the HTML of a board page.

    Each ``<dd>...</dd>`` entry is matched with a single regular expression
    that captures, in order: the rank, the cover image URL, the title, the
    cast text, the release text, and the integer and fractional parts of
    the score.

    Args:
        html: HTML text of one board page.

    Yields:
        dict with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time``, ``area`` and ``score`` (all values are strings).
    """
    # Raw strings keep ``\d`` a regex escape instead of an invalid string
    # escape. re.S lets ``.`` match newlines because one entry spans
    # several lines; ``.*?`` is non-greedy so each capture stops at the
    # first possible delimiter.
    regex = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in regex.findall(html):
        # Drop the five-character label in front of the release text once
        # and reuse it for both the date and the area.
        release_info = release.strip()[5:]
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Strip surrounding whitespace, then drop the three-character
            # label in front of the cast list.
            'actor': star.strip()[3:],
            'time': get_release_time(release_info),
            'area': get_release_area(release_info),
            # Integer part and fractional part joined into one score string.
            'score': integer + fraction,
        }

'''''
#保存到txt，会发现中文汉字变成了unic的编码,加上encoding='utf-8'，ensure_ascii=False，则汉字可正常输出
def write_to_file(content):
 with open('result.txt','a',encoding='utf-8') as f: # 参数 a ，表示直接往后追加
  f.write(json.dumps(content,ensure_ascii=False) +'\n') #content是一个字典的形式，用json.dumps 把它转换为字符串，再加个换行符
  f.close()
#json.dumps :dict 转换为 str
#json.loads: str 转换为 dict
'''
''''''''
# 获取上映时间 <p class="releasetime">上映时间：1993-01-01(中国香港)</p>
def get_release_time(data):
    """Return the release date part of a release string.

    The input looks like ``1993-01-01(中国香港)``: the date is everything
    before the first ``(``. When there is no ``(`` the whole string is
    returned.

    Args:
        data: Release text with its leading label already removed.

    Returns:
        The text in front of the first ``(``, or ``data`` unchanged when it
        contains no ``(``.
    """
    # str.partition never fails, unlike the previous regex search, which
    # returned None (and then raised AttributeError) for text containing an
    # inner line break.
    return data.partition('(')[0]

# 获取上映地区
def get_release_area(data):
    """Return the release area given in parentheses in a release string.

    Args:
        data: Release text such as ``1993-01-01(中国香港)``.

    Returns:
        The text inside the parentheses, or ``'未知'`` when the string has
        no parenthesised part.
    """
    # Raw string so that ``\(`` and ``\)`` are regex escapes for literal
    # parentheses rather than invalid string escapes.
    match = re.search(r'.*\((.*)\)', data)
    if match is None:
        return '未知'
    return match.group(1)

# 获取封面大图,不需要
# def get_large_thumb(url):
#  pattern = '(.*?)@.*?'
#  regex = re.compile(pattern)
#  w = regex.search(url)
#  return w.group(1) 

# 存储数据
def store_data(item):
    """Append one movie record as a row of ``movie.csv``.

    Args:
        item: Mapping with the keys ``index``, ``image``, ``title``,
            ``actor``, ``time``, ``area`` and ``score``; the values are
            written in that column order.
    """
    columns = ('index', 'image', 'title', 'actor', 'time', 'area', 'score')
    # newline='' leaves line endings to the csv module; 'a' appends to the
    # rows already in the file.
    with open('movie.csv', 'a', newline='', encoding='utf-8') as data_csv:
        row = [item[column] for column in columns]
        csv.writer(data_csv).writerow(row)
# 参数newline是用来控制文本模式之下，一行的结束字符。可以是None，''，\n，\r，\r\n等。
'''''
也可判断异常，一般没错
  try:
   csv_writer = csv.writer(data_csv)
   csv_writer.writerow([item['index'], item['image'], item['title'], item['actor'],item['time'],item['area'],item['score']])
  except Exception as e:
   print(e)
   print(item)
''' 

# 下载封面图
#读方式打开的话，并不会新建；写方式打开的话就会新建。 r只读，w可写，a追加
def download_thumb(title, image, timeout=10):
    """Download a cover image and save it as ``image/<title>.jpg``.

    The download is best effort: a failed request is printed and the
    function returns without writing a file.

    Args:
        title: Movie title, used as the file name.
        image: URL of the cover image.
        timeout: Seconds to wait for the server before giving up.
    """
    import os  # local import: the file's import block does not provide os

    try:
        response = requests.get(image, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that an error page is never
        # saved under a .jpg name.
        response.raise_for_status()
    except RequestException as e:
        print(e)
        return
    # Opening in 'wb' creates the file but not its directory.
    os.makedirs('image', exist_ok=True)
    with open(os.path.join('image', title + '.jpg'), 'wb') as f:
        f.write(response.content)  # raw bytes of the image

# 主调度程序
def main():
    """Crawl every page of the board and append each movie to the CSV file.

    Pages that cannot be fetched are reported on stdout and skipped.
    """
    # No trailing '?': requests appends the query string itself.
    start_url = 'http://maoyan.com/board/4'
    # The board holds 100 entries paged by offset, so offsets 0, 10, ..., 90
    # cover it completely.
    for offset in range(0, 100, 10):
        html = get_one_page(url=start_url, offset=offset)
        if html is None:
            # %-formatting to match the %s placeholders in the message.
            print('链接:%s?offset=%s异常' % (start_url, offset))
            continue
        for item in parse_one_page(html):
            store_data(item)
            # Optional cover download:
            # download_thumb(item['title'], item['image'])
# 

# Run the crawler only when this file is executed as a script, not when it
# is imported.
if __name__=='__main__':
 main()

'''''
if __name__=='__main__':
 for i in range(10):
  main(i*10)
''' 

'''''
if __name__=='__main__':
 for i in range(10):
  main(i*10)
 pool=Pool() #可以提供指定数量的进程供用户调用，如果有一个新的请求被提交到进程池，进程池还没有满，就会创建新的进程来执行请求，如果满了，就先等待
 pool.map(main,[i*10 for i in range(10)]) #将数组中的每一个元素拿出来当做函数的参数，然后创建一个个的进程，放到进程池里面去运行；第二个参数是构造一个数组，组成循环
 #速度明显变快！1s
'''

保存到数据库

def main(offset):
    """Scrape one board page and insert its movies into ``test_movies``.

    Failures are printed rather than raised, so one bad page or row does not
    stop the other pages.

    Args:
        offset: Paging offset of the board page to fetch.
    """
    # pymysql is used below but is missing from the file's imports; importing
    # it here also makes it available inside pool worker processes.
    import pymysql

    html = get_one_page('http://maoyan.com/board/4', offset)
    if html is None:
        print('链接:%s?offset=%s异常' % ('http://maoyan.com/board/4', offset))
        return

    sql = "insert into test_movies (number,picture,title,actors,time,area,score) values(%s,%s,%s,%s,%s,%s,%s)"
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd=' ', port=3306,
                               db='test1', charset="utf8", use_unicode=False)
    except pymysql.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        return

    try:
        cur = conn.cursor()
        try:
            for item in parse_one_page(html):
                try:
                    # The dict keys are the ones parse_one_page yields; the
                    # column names belong to the table, not to the dict.
                    cur.execute(sql, (item['index'], item['image'], item['title'],
                                      item['actor'], item['time'], item['area'],
                                      item['score']))
                except pymysql.Error as e:
                    print(e)
                else:
                    # Report success only for rows that were really inserted.
                    print('- - - - - 数据保存成功 - - - - -')
            conn.commit()
        finally:
            cur.close()
    except pymysql.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Always release the connection, also when an insert or commit fails.
        conn.close()

# Script entry point: recreate the target table, then scrape the ten board
# pages in parallel worker processes.
if __name__ == '__main__':
    import pymysql  # used below but missing from the file's imports

    conn = pymysql.connect(host='localhost', user='root', passwd=' ', port=3306, db='test1', charset="utf8")
    try:
        cur = conn.cursor()
        try:
            # Start from an empty table on every run.
            cur.execute("DROP TABLE IF EXISTS test_movies")
            sqlc = """CREATE TABLE test_movies(
    number int not null primary key auto_increment,
    picture VARCHAR(100) NOT NULL,
    title VARCHAR(100) NOT NULL,
    actors VARCHAR(200) NOT NULL,
    time VARCHAR(100) NOT NULL,
    area VARCHAR(100) ,
    score VARCHAR(50) NOT NULL
  )"""
            cur.execute(sqlc)
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the set-up connection before the workers start; each worker
        # opens its own connection in main().
        conn.close()

    # pool.map blocks until every page is processed; the with-block then
    # shuts the pool down.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持我们。

您可能感兴趣的文章:

python爬虫正则表达式使用技巧及爬取个人博客的实例讲解
Python使用Selenium+BeautifulSoup爬取淘宝搜索页
python3爬取各类天气信息
使用Python爬取最好大学网大学排名
python爬取淘宝商品详情页数据
python3爬取淘宝信息代码分析
python爬虫爬取某站上海租房图片
python爬取拉勾网职位数据的方法
Python爬虫爬取一个网页上的图片地址实例代码
python爬虫爬取淘宝商品信息（selenum+phontomjs）

python爬虫正则表达式使用技巧及爬取个人博客的实例讲解

这篇博客是自己<数据挖掘与分析>课程讲到正则表达式爬虫的相关内容,主要简单介绍Python正则表达式爬虫,同时讲述常见的正则表达式分析方法,最后通过实例爬取作者的个人博客网站.希望这篇基础文章对您有所帮助,如果文章中存在错误或不足之处,还请海涵.真的太忙了,太长时间没有写博客了,抱歉~ 一.正则表达式正则表达式(Regular Expression,简称Regex或RE)又称为正规表示法或常规表示法,常常用来检索.替换那些符合某个模式的文本,它首先设定好了一些特殊的字及字符组合,通过组合的&
Python爬虫爬取一个网页上的图片地址实例代码

本文实例主要是实现爬取一个网页上的图片地址,具体如下. 读取一个网页的源代码: import urllib.request def getHtml(url): html=urllib.request.urlopen(url).read() return html print(getHtml(http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A3%81%E7%BA%B8&ct=201326592&am
python3爬取淘宝信息代码分析

# encoding:utf-8 import re # 使用正则匹配想要的数据 import requests # 使用requests得到网页源码这个函数是用来得到源码 # 得到主函数传入的链接 def getHtmlText(url): try: # 异常处理 # 得到你传入的URL链接设置超时时间3秒 r = requests.get(url, timeout=3) # 判断它的http状态码 r.raise_for_status() # 设置它的编码 encoding是设置它的头
使用Python爬取最好大学网大学排名

本文实例为大家分享了Python爬取最好大学网大学排名的具体代码,供大家参考,具体内容如下源代码: #-*-coding:utf-8-*- ''''' Created on 2017年3月17日 @author: lavi ''' import requests from bs4 import BeautifulSoup import bs4 def getHTMLText(url): try: r = requests.get(url) r.raise_for_status r.encodi
Python使用Selenium+BeautifulSoup爬取淘宝搜索页

使用Selenium驱动chrome页面,获得淘宝信息并用BeautifulSoup分析得到结果. 使用Selenium时注意页面的加载判断,以及加载超时的异常处理. import json import re from bs4 import BeautifulSoup from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.com
python爬取拉勾网职位数据的方法

今天写的这篇文章是关于python爬虫简单的一个使用,选取的爬取对象是著名的招聘网站--拉勾网,由于和大家的职业息息相关,所以爬取拉勾的数据进行分析,对于职业规划和求职时的信息提供有很大的帮助. 完成的效果爬取数据只是第一步,怎样使用和分析数据也是一大重点,当然这不是本次博客的目的,由于本次只是一个上手的爬虫程序,所以我们的最终目的只是爬取到拉勾网的职位信息,然后保存到MySQL数据库中.最终的效果示意图如下: 控制台输入数据库显示准备工作首先需要安装python,这个网上已经有很多的
python爬虫爬取某站上海租房图片

对于一个.NET开发者,爬虫真的以前没有写过.这段时间开始学习python爬虫,今天周末无聊写了一段代码爬取上海租房图片,其实很简短就是利用爬虫的第三方库Requests与BeautifulSoup.python 版本:python3.6 ,IDE :pycharm.其实就几行代码,但希望没有开发基础的人也能一下子看明白,所以大神请绕行. 第三方库首先安装我是用的pycharm所以另外的脚本安装我这就不介绍了. 如上图打开默认设置选择Project Interpreter,双击pip或者点击加
python爬虫爬取淘宝商品信息（selenum+phontomjs）

本文实例为大家分享了python爬虫爬取淘宝商品的具体代码,供大家参考,具体内容如下 1.需求目标 : 进去淘宝页面,搜索耐克关键词,抓取商品的标题,链接,价格,城市,旺旺号,付款人数,进去第二层,抓取商品的销售量,款号等. 2.结果展示 3.源代码 # encoding: utf-8 import sys reload(sys) sys.setdefaultencoding('utf-8') import time import pandas as pd time1=time.time()
python3爬取各类天气信息

本来是想从网上找找有没有现成的爬取空气质量状况和天气情况的爬虫程序,结果找了一会儿感觉还是自己写一个吧. 主要是爬取北京包括北京周边省会城市的空气质量数据和天气数据. 过程中出现了一个错误:UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 250. 原来发现是页面的编码是gbk,把语句改成data=urllib.request.urlopen(url).read().decode("gbk")就可以
python爬取淘宝商品详情页数据

在讲爬取淘宝详情页数据之前,先来介绍一款 Chrome 插件:Toggle JavaScript (它可以选择让网页是否显示 js 动态加载的内容),如下图所示: 当这个插件处于关闭状态时,待爬取的页面显示的数据如下: 当这个插件处于打开状态时,待爬取的页面显示的数据如下: 可以看到,页面上很多数据都不显示了,比如商品价格变成了划线价格,而且累计评论也变成了0,说明这些数据都是动态加载的,以下演示真实价格的找法(评论内容找法类似),首先检查页面元素,然后点击Network选项卡,刷新页面,可

python正则表达式爬取猫眼电影top100

您可能感兴趣的文章:

相关推荐

随机推荐