python实现爬虫下载漫画示例

代码如下:

#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2

weburl=''
floder=''
chapterbegin=0
currentthreadnum=0
threadcount=6

if len(sys.argv)>=3:
  weburl=sys.argv[1]
  floder=sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
    sys.exit(0)
if len(sys.argv)>=4:
  chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
  threadcount=(int)(sys.argv[4])

def jin(i,jinzhi):
        finalans=""
        answer=i%jinzhi
        i=int(i/jinzhi)
        if answer>9:
                finalans=finalans+chr(ord('a')+(answer-10))
        else:
                finalans=finalans+str(answer)
        if i!=0:
                finalans=jin(i,jinzhi)+finalans
        return finalans
def urlparse(p,a,c,k):
        d={}
        e=lambda c:     jin(c,36)
        if 1:
                while c:
                        c=c-1
                        if not k[c]:
                                d[jin(c,36)]=jin(c,36)
                        else:
                                d[jin(c,36)]=k[c]
                k=[lambda e:d[e]]
                e=lambda c:'\\w+'
                c=1
        newstr=""
        while c:
                c=c-1
                if k[c]:
                        for i in range(0,len(p)):
                                tempi=p[i]
                                tempi=ord(tempi)
                                if tempi>=ord('a') and tempi<=ord('f'):
                                        newstr+=d[chr(tempi)]
                                elif tempi>=ord('0') and tempi<=ord('9'):
                                        newstr+=d[chr(tempi)]
                                else:
                                        newstr+=chr(tempi)
        return newstr
def meispower(s):
        p=re.compile(r"(?=\}\().*",re.IGNORECASE)
        s=p.findall(s)
        s=s[0]
        s=s[0:(len(s)-19)]
        par=s.split(',')
        par[3]=par[3][1:len(par[3])]
        answer=par[3].split('|')
        chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
        allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
        allurl=allurl[10:(len(allurl)-2)]
        return allurl
def pictofile(weburl,filename,loop=100):
        if loop<0:
                print('can\'t download the picture %s'%weburl)
                return
        loop=loop-1
        if os.path.exists(filename):
            return
        try:
                url=urllib.request.urlopen(weburl)
                data=url.read()
                if len(data)<2048:
                        url.close()
                        pictofile(weburl,filename,loop)
                else:
                        print('download from %s name is %s\n'%(weburl,filename))
                        myfile=open('%s'%filename,'wb')
                        myfile.write(data)
                        myfile.close()
                        url.close();
        except socket.timeout:
                print('timeout')
                pictofile(weburl,filename,loop)
        except Exception as e:
          print('error',e)
          pictofile(weburl,filename,loop)
        finally:
            pass
def downloadpic(url,loadpicdir,num):
    #download the all url picture to loadpicdir
    global currentthreadnum,mutex,mutex2
    mymode=re.compile(r'[0-9a-z.]*\Z')
    try:
                mutex2.acquire()
                os.chdir(loadpicdir)
                mutex2.release()
    except:
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    if(mutex2.locked()):
                        os.mkdir(loadpicdir)
                        os.chdir(loadpicdir)
                        mutex2.release()
                    print('create floder succeed')
                except:
                    print("can't create floder %s"%loadpicdir)
                    if(mutex.acquire()):
                        mutex.release()
                    quit(0)
    name=mymode.findall(url)
    filename='manhua'+name[0]
    pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    mutex.acquire()
    currentthreadnum=currentthreadnum-1
    mutex.release()
def downloadchapter(url,loadpicdir,num,begin=0):
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        webdata=urllib.request.urlopen(manhuaweb+url).read()
        webdata=webdata.decode('UTF-8')
        chaptername=re.findall(r'<title>[^_]*',webdata)[0]
        chaptername=chaptername[7:len(chaptername)]
        webscrip=re.findall(r'eval.*[^<>]',webdata)
        chapterurl=meispower(webscrip[0]);
        chapterurl='http://mhimg.ali213.net'+chapterurl
        for i in range(begin,num):
                try:
                        while(currentthreadnum>=threadcount):
                                time.sleep(0.5)
                        mutex.acquire()
                        currentthreadnum=currentthreadnum+1
                        mutex.release()
                        threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
                except socket.error:
                        mutex.acquire()
                        i=i-1
                        currentthreadnum=currentthreadnum-1
                        mutex.release()
                except Exception as error:
                        print(error,'break')
                        print('download chapter %d of picture make a error'%i)
                        break
if __name__=='__main__':
        manhuaweb=r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex=threading.Lock()
        mutex2=threading.Lock()

webfile=urllib.request.urlopen(weburl)
        webdata=webfile.read();
        webdata=webdata.decode('UTF-8')
        meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata=meshmode.findall(webdata)[0]
        indexmode=re.compile(r'([0-9]*页)')
        indexdata=indexmode.findall(meshdata)

picurlmode=re.compile(r'/comic/[0-9/]*.html')
        picurldata=picurlmode.findall(meshdata)

chapterlength=len(picurldata)
        nummode=re.compile(r'[\d]+')

i=chapterbegin
        while i<chapterlength:
                manhuachapter=picurldata[chapterlength-i-1]
                downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
                i=i+1

(0)

相关推荐

  • python实现爬虫下载美女图片

    本次爬取的贴吧是百度的美女吧,给广大男同胞们一些激励 在爬取之前需要在浏览器先登录百度贴吧的帐号,各位也可以在代码中使用post提交或者加入cookie 爬行地址:http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=0 #-*- coding:utf-8 -*- import urllib2 import re import requests from lxml import etree 这些是要导入的库,代码并没有使用正则

  • python实现支持目录FTP上传下载文件的方法

    本文实例讲述了python实现支持目录FTP上传下载文件的方法.分享给大家供大家参考.具体如下: 该程序支持ftp上传下载文件和目录.适用于windows和linux平台. #!/usr/bin/env python # -*- coding: utf-8 -*- import ftplib import os import sys class FTPSync(object): conn = ftplib.FTP() def __init__(self,host,port=21): self.c

  • Python实现批量下载文件

    Python实现批量下载文件 #!/usr/bin/env python # -*- coding:utf-8 -*- from gevent import monkey monkey.patch_all() from gevent.pool import Pool import requests import sys import os def download(url): chrome = 'Mozilla/5.0 (X11; Linux i86_64) AppleWebKit/537.36

  • Python3访问并下载网页内容的方法

    本文实例讲述了Python3访问并下载网页内容的方法.分享给大家供大家参考.具体如下: #!/usr/local/bin/python3.2 import urllib.request,io,os,sys req = urllib.request.Request("http://www.google.com") f = urllib.request.urlopen(req) s = f.read() s = s.decode('gbk','ignore') mdir = sys.pat

  • 用Python实现一个简单的能够上传下载的HTTP服务器

    #!/usr/bin/env python #coding=utf-8 # modifyDate: 20120808 ~ 20120810 # 原作者为:bones7456, http://li2z.cn/ # 修改者为:decli@qq.com # v1.2,changeLog: # +: 文件日期/时间/颜色显示.多线程支持.主页跳转 # -: 解决不同浏览器下上传文件名乱码问题:仅IE,其它浏览器暂时没处理. # -: 一些路径显示的bug,主要是 cgi.escape() 转义问题 #

  • 使用Python编写简单网络爬虫抓取视频下载资源

    我第一次接触爬虫这东西是在今年的5月份,当时写了一个博客搜索引擎,所用到的爬虫也挺智能的,起码比电影来了这个站用到的爬虫水平高多了! 回到用Python写爬虫的话题. Python一直是我主要使用的脚本语言,没有之一.Python的语言简洁灵活,标准库功能强大,平常可以用作计算器,文本编码转换,图片处理,批量下载,批量处理文本等.总之我很喜欢,也越用越上手,这么好用的一个工具,一般人我不告诉他... 因为其强大的字符串处理能力,以及urllib2,cookielib,re,threading这些

  • python批量下载壁纸的实现代码

    复制代码 代码如下: #! /usr/bin/env python ##python2.7-批量下载壁纸 ##壁纸来自桌酷网站,所有权归属其网站 ##本代码仅做为交流学习使用,请勿用于商业用途,否则后果自负 ##Code by Dreamlikes import re,urllib,urllib2 #保存图片的路径 savepath = 'd:\\picture\\' #壁纸集合的URL,如下 url = 'http://www.zhuoku.com/zhuomianbizhi/game-gam

  • python批量下载图片的三种方法

    有三种方法,一是用微软提供的扩展库win32com来操作IE,二是用selenium的webdriver,三是用python自带的HTMLParser解析.win32com可以获得类似js里面的document对象,但貌似是只读的(文档都没找到).selenium则提供了Chrome,IE,FireFox等的支持,每种浏览器都有execute_script和find_element_by_xx方法,可以方便的执行js脚本(包括修改元素)和读取html里面的元素.不足是selenium只提供对py

  • python爬虫教程之爬取百度贴吧并下载的示例

    测试url:http://tieba.baidu.com/p/27141123322?pn=begin  1end   4 复制代码 代码如下: import string ,urllib2 def baidu_tieba(url,begin_page,end_page):    for i in range(begin_page, end_page+1):        sName = string.zfill(i,5)+ '.html'        print '正在下载第' + str(

  • 利用python写个下载teahour音频的小脚本

    前言 最近空闲的时候看到了之前就关注的一个小站http://teahour.fm/,一直想把这里的音频都听一遍,可转眼间怎么着也有两年了,却什么也没做.有些伤感,于是就写了个脚本,抓了下音频的下载链接,等下载下来后一定要认真听听. 时间仓促,加调试也就那么十几分钟,脚本写的可能有些烂,大家可以留言指出. teahour.py #!/usr/bin/env python #coding: utf-8 import sys import requests from BeautifulSoup imp

随机推荐