A Beginner's Crawler: Make the Crawler Run Fast, Downloading Memes with Multithreading

Author: 网络编程  Published: 2019-11-03

5.1. The threading module

Basic usage:

import threading
import time

def coding():
    for x in range(3):
        print('Writing code %s' % x)
        time.sleep(2)

def drawing():
    for x in range(3):
        print('Drawing %s' % x)
        time.sleep(2)

def main():
    t1 = threading.Thread(target=coding)
    t2 = threading.Thread(target=drawing)
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
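start() only launches a thread; it does not wait for it to finish. As a small addition (not in the original post), here is a sketch of two features that come in handy alongside the basics above: passing arguments to the target with args, and waiting for workers with join():

import threading
import time

def download(task_id, delay):
    # stand-in for an I/O-bound job such as downloading a file
    time.sleep(delay)
    print('task %d finished in %s' % (task_id, threading.current_thread().name))

def main():
    threads = []
    for i in range(3):
        # args passes positional arguments through to the target function
        t = threading.Thread(target=download, args=(i, 1))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # block until every worker is done
    print('all tasks finished')

if __name__ == '__main__':
    main()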

5.2. Producer and consumer

The producer/consumer pattern, implemented with a Lock:

import threading
import random
import time

gMoney = 1000             # shared balance
gLock = threading.Lock()
gTotalTimes = 10          # produce at most 10 times in total
gTimes = 0                # how many times we have produced so far


class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes
        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            # Produce only 10 times; past the limit we must release the
            # lock before breaking, otherwise we deadlock.
            if gTimes >= gTotalTimes:
                gLock.release()
                break
            gMoney += money
            print('%s produced %d yuan, %d yuan left' % (threading.current_thread(), money, gMoney))
            # One production done; bump the counter (10 in total).
            gTimes += 1
            gLock.release()
            time.sleep(0.5)


class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            if gMoney >= money:
                gMoney -= money
                print('%s consumed %d yuan, %d yuan left' % (threading.current_thread(), money, gMoney))
            else:
                # Not enough money left and production is over: exit.
                if gTimes >= gTotalTimes:
                    gLock.release()
                    break
            gLock.release()
            time.sleep(0.5)


def main():
    for x in range(5):
        t1 = Producer()
        t1.start()

    for x in range(2):
        t2 = Consumer()
        t2.start()

if __name__ == '__main__':
    main()
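With a bare Lock the consumers busy-wait: they keep waking up, taking the lock, and finding there is not enough money. As an addition (not in the original post), here is a minimal sketch of the same producer/consumer built on threading.Condition, which lets a consumer sleep until a producer notifies it:

import threading
import random
import time

gMoney = 1000
gCondition = threading.Condition()
gTotalTimes = 10
gTimes = 0

class Producer(threading.Thread):
    def run(self):
        global gMoney, gTimes
        while True:
            money = random.randint(100, 1000)
            with gCondition:  # a Condition wraps a lock
                if gTimes >= gTotalTimes:
                    gCondition.notify_all()  # let waiting consumers recheck and exit
                    break
                gMoney += money
                gTimes += 1
                print('%s produced %d, balance %d' % (self.name, money, gMoney))
                gCondition.notify_all()  # wake up sleeping consumers
            time.sleep(0.5)

class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            with gCondition:
                # wait() releases the lock and sleeps until notified
                while gMoney < money:
                    if gTimes >= gTotalTimes:
                        return  # production is over and we cannot afford more
                    gCondition.wait()
                gMoney -= money
                print('%s consumed %d, balance %d' % (self.name, money, gMoney))
            time.sleep(0.5)

if __name__ == '__main__':
    for _ in range(5):
        Producer().start()
    for _ in range(2):
        Consumer().start()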

5.3. Downloading memes

Site:

Parsing: XPath

Without multithreading, the download is relatively slow:

import requests
from lxml import etree
from urllib import request
import os
import re

def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Referer': 'https://movie.douban.com/'
    }
    response = requests.get(url,headers=headers)
    text = response.text
    html = etree.HTML(text)
    imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    for img in imgs:
        # print(etree.tostring(img))
        # image URL
        img_url = img.get('data-original')
        # image name
        alt = img.get('alt')
        # strip special characters from the name
        alt = re.sub(r'[??.,。!!*]', '', alt)
        # file extension of the image (.gif, .jpg, ...)
        suffix = os.path.splitext(img_url)[1]
        # full file name used when saving
        filename = alt + suffix
        request.urlretrieve(img_url, 'C:/Users/Administrator/Desktop/images/' + filename)

def main():
    for x in range(1,10):
        url = 'http://www.doutula.com/photo/list/?page=%d'%x
        parse_page(url)

if __name__ == '__main__':
    main()
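Worth noting (my addition, not from the original post): urlretrieve raises an error if the destination folder does not exist, so it can pay to create it once up front. The path below is the one the code above assumes:

import os

# create the destination folder before any download starts;
# exist_ok=True makes this a no-op if the folder already exists
save_dir = 'C:/Users/Administrator/Desktop/images/'
os.makedirs(save_dir, exist_ok=True)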

Using multithreading

main()

  • Define two queues, and create the threads
  • page_queue: stores the URL of each list page
  • img_queue: stores the URLs of all the memes found on each page

Producer()

  • Take page URLs from the page_queue queue until it is empty, then break
  • Use XPath to extract the URLs of all the images on each page
  • Put each image's URL and file name into the img_queue queue

Consumer()

  • Take an image URL and file name out of the img_queue queue
  • Download and save the image
  • Break once both page_queue and img_queue are empty

Code:

import requests
from lxml import etree
from urllib import request
import os
import re
import threading
from queue import Queue

class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Referer': 'https://movie.douban.com/'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self,url):
        response = requests.get(url,headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            # print(etree.tostring(img))
            # image URL
            img_url = img.get('data-original')
            # image name
            alt = img.get('alt')
            # strip special characters from the name
            alt = re.sub(r'[??.,。!!*]', '', alt)
            # file extension of the image (.gif, .jpg, ...)
            suffix = os.path.splitext(img_url)[1]
            # full file name used when saving
            filename = alt + suffix
            self.img_queue.put((img_url, filename))


class Consumer(threading.Thread):
    def __init__(self,page_queue,img_queue,*args,**kwargs):
        super(Consumer, self).__init__(*args,**kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            img_url, filename = self.img_queue.get()
            request.urlretrieve(img_url, 'C:/Users/Administrator/Desktop/images/' + filename)
            print("Downloaded one image")


def main():
    page_queue = Queue(1000)
    img_queue = Queue(10000)

    for x in range(1,1758):
        url = 'http://www.doutula.com/photo/list/?page=%d'%x
        page_queue.put(url)

    for x in range(10):
        t = Producer(page_queue,img_queue)
        t.start()

    for x in range(10):
        t = Consumer(page_queue,img_queue)
        t.start()

if __name__ == '__main__':
    main()
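A caveat about the code above (my observation, not the original author's): checking Queue.empty() and then calling a blocking get() is racy. A consumer can pass the emptiness check while producers are still parsing pages, and then block forever on get() once everything has drained. A hedged sketch of a consumer loop that uses a timeout instead:

from queue import Empty

def consume(img_queue, page_queue):
    # consumer loop that cannot hang forever on a drained queue
    while True:
        try:
            # wait a few seconds for work instead of blocking indefinitely
            img_url, filename = img_queue.get(timeout=5)
        except Empty:
            # nothing arrived in time; if no pages remain either, stop
            if page_queue.empty():
                break
            continue
        print('would download', img_url, 'as', filename)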

Result:

(screenshot of the downloaded images)

Multiprocessing + multithreading with a MongoDB queue

The crawler can be pushed further still by combining multiple processes with multiple threads, coordinated through a queue stored in MongoDB. We keep using MongoDB here not to raise the difficulty, but to simplify the technology stack. (And of course, 芝麻HTTP keeps providing the network resource support.)

OK! Let's organize the approach first:

  • Every gallery URL is stored as a document in a MongoDB collection that acts as a shared crawl queue
  • Each document carries a status: OUTSTANDING (initial), PROCESSING (being downloaded), or COMPLETE (finished)
  • Workers pop an OUTSTANDING URL, mark it PROCESSING while they work on it, and mark it COMPLETE when done; entries stuck in PROCESSING past a timeout are reset to OUTSTANDING

Let's get started, Go Go Go! Below is the queue code:

from datetime import datetime, timedelta
from pymongo import MongoClient, errors

class MogoQueue():

    OUTSTANDING = 1  ## initial state
    PROCESSING = 2   ## being downloaded
    COMPLETE = 3     ## download finished

    def __init__(self, db, collection, timeout=300):  ## set up the MongoDB connection
        self.client = MongoClient()
        self.Client = self.client[db]
        self.db = self.Client[collection]
        self.timeout = timeout

    def __bool__(self):
        """
        This method queries the queue for any document whose status is not
        COMPLETE. My understanding is that as long as that query finds
        something, the whole instance evaluates to true.
        """
        record = self.db.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url, title):  ## add a new URL and its title to the queue
        try:
            self.db.insert({'_id': url, 'status': self.OUTSTANDING, '主题': title})
            print(url, 'inserted into the queue')
        except errors.DuplicateKeyError as e:  ## a duplicate-key error means the URL is already queued
            print(url, 'is already in the queue')
            pass

    def push_imgurl(self, title, url):
        try:
            self.db.insert({'_id': title, 'statue': self.OUTSTANDING, 'url': url})
            print('image URL inserted')
        except errors.DuplicateKeyError as e:
            pass

    def pop(self):
        """
        Atomically take one OUTSTANDING document, mark it PROCESSING with a
        timestamp, and return its _id (which is our URL). MongoDB is handy,
        isn't it? ^_^ If nothing is OUTSTANDING, reset timed-out entries
        and raise KeyError.
        """
        record = self.db.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError

    def pop_title(self, url):
        record = self.db.find_one({'_id': url})
        return record['主题']

    def peek(self):
        record = self.db.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        self.db.update({'_id': url}, {'$set': {'status': self.COMPLETE}})  ## mark the URL as finished

    def repair(self):
        """
        Reset every document whose timestamp has timed out (and that is not
        COMPLETE) back to OUTSTANDING, so a crashed worker's URL gets picked
        up again.
        """
        record = self.db.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('reset URL status', record['_id'])

    def clear(self):
        self.db.drop()  ## drops the whole collection, so only call it when starting fresh

As for what all of this is for, it shows up in the thread-pool loop further down (if my understanding is wrong, please point it out, thanks; I'm a Python newbie too).

This script seeds the queue with the gallery URLs:

from mongodb_queue import MogoQueue
from Download import request
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')

def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        url = a['href']
        spider_queue.push(url, title)  ## write each URL and its title into the MongoDB queue

if __name__ == "__main__":
    start('')

And below is the multiprocess + multithread download code:

import os
import time
import threading
import multiprocessing
from mongodb_queue import MogoQueue
from Download import request
from bs4 import BeautifulSoup

SLEEP_TIME = 1

def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  ## the queue we take page URLs from
    ##img_queue = MogoQueue('meinvxiezhenji', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                break  ## the queue is empty
            else:
                img_urls = []
                title = crawl_queue.pop_title(url)
                mkdir(title)
                os.chdir('D:\\mzitu\\' + title)
                max_span = BeautifulSoup(request.get(url, 3).text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + '/' + str(page)
                    img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src']
                    img_urls.append(img_url)
                    save(img_url)
                crawl_queue.complete(url)  ## mark the URL as finished

    def save(img_url):
        name = img_url[-9:-4]
        img = request.get(img_url, 3)
        with open(name + '.jpg', 'ab') as f:
            f.write(img.content)

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\\mzitu", path))
        if not isExists:
            print(u'created a folder named', path)
            os.makedirs(os.path.join("D:\\mzitu", path))
            return True
        else:
            print(u'a folder named', path, u'already exists')
            return False

    threads = []
    while threads or crawl_queue:
        ## threads or crawl_queue being truthy means the download is not
        ## finished yet, so the program keeps running; this is exactly
        ## what MogoQueue.__bool__ is for
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  ## spawn workers while the pool holds fewer than max_threads threads and OUTSTANDING URLs remain
            thread = threading.Thread(target=pageurl_crawler)  ## create a thread
            thread.start()  ## start the thread
            threads.append(thread)  ## add it to the thread pool
        time.sleep(SLEEP_TIME)

def process_crawler():
    process = []
    for i in range(multiprocessing.cpu_count()):
        p = multiprocessing.Process(target=mzitu_crawler)  ## create a process
        p.start()  ## start the process
        process.append(p)  ## add it to the process list
    for p in process:
        p.join()  ## wait for every process to finish

if __name__ == "__main__":
    process_crawler()

Multithreading does seem to have a problem around the working-directory switching (the os.chdir call); you could try putting a lock around it. I tested this: roughly seven minutes to download 100 sets of images. I would also like to make the image downloads asynchronous (downloading images is where the crawler spends the longest I/O waits), but I haven't really figured async out yet, so I won't embarrass myself with it here.

All of this code is posted at this location:

Reposted from: 静觅 » 小白爬虫第四弹之爬虫快跑(多进程+多线程)
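One API note from today's perspective (my addition): insert, update, and find_and_modify are old PyMongo 2.x methods and have been removed from current PyMongo releases. A hedged sketch of what pop() looks like with the modern API, reusing the database, collection, and status values from the code above:

from datetime import datetime
from pymongo import MongoClient, ReturnDocument

client = MongoClient()
queue = client['meinvxiezhenji']['crawl_queue']

# find_one_and_update is the modern, equally atomic replacement for
# find_and_modify: claim one OUTSTANDING document and mark it PROCESSING
record = queue.find_one_and_update(
    {'status': 1},                                          # OUTSTANDING
    {'$set': {'status': 2, 'timestamp': datetime.now()}},   # PROCESSING
    return_document=ReturnDocument.BEFORE,
)
if record is not None:
    print('claimed', record['_id'])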
