Python Crawler in Practice (3): Scraping and Batch-Downloading Images

The script below walks pages 1 through 5 of the gallery listing, pulls each entry's title and thumbnail URL, and saves the images to a local folder. The code is as follows:

import os
import re
import urllib.request, urllib.error

import requests
from bs4 import BeautifulSoup

# Regexes that pull the title and the thumbnail URL out of each
# stringified <div class="txList"> block
findtitle = re.compile(r'<a class="imgTitle" href=".*" target="_blank" .*?>(.*)</a>')
findlink = re.compile(r'<img class="lazy" height="180" src="(.*)" width="180"/>')

base_url = "https://www.woyaogexing.com/tupian/weimei/index_"

os.makedirs("../download", exist_ok=True)  # make sure the output folder exists

for i in range(1, 6):
    # Build the page URL: page 1 is index.html, later pages are index_2.html, index_3.html, ...
    if i == 1:
        page_url = "https://www.woyaogexing.com/tupian/weimei/index.html"
    else:
        page_url = base_url + str(i) + ".html"
    print("Downloading images from page %d" % i)

    # Fetch and decode the listing page
    request = urllib.request.Request(page_url)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")

    soup = BeautifulSoup(html, "html.parser")
    for txList in soup.find_all('div', class_="txList"):
        txList = str(txList)
        # Extract the title, falling back to a placeholder when nothing matches
        titles = re.findall(findtitle, txList)
        title = titles[0] if titles else "1"
        # Strip characters that are not safe in file names
        title = re.sub(r'[/|“]', "", title)
        link = re.findall(findlink, txList)[0]
        img_url = "http:" + link
        # Download the image and save it under its title, keeping the extension
        img_response = requests.get(img_url)
        image_ext = img_url.split(".")[-1]
        image_path = "../download/" + title + "." + image_ext
        with open(image_path, 'wb') as f:
            f.write(img_response.content)

print("Crawling finished")
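Two more practical tweaks worth considering: urllib's default User-Agent is rejected by some hosts, and rapid-fire requests are hard on the server. A small sketch of both fixes; the UA string below is a generic placeholder, not something this site is known to require:

import time

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder browser UA string
request = urllib.request.Request(page_url, headers=headers)
response = urllib.request.urlopen(request)
time.sleep(1)  # pause between pages to stay polite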