用户登录
用户注册

分享至

Python批量抓取图片

  • 作者: 石林22632073
  • 来源: 51数据库
  • 2022-08-12
# -*- coding:utf-8 -*-
# coding=UTF-8

import os,urllib,urllib2,re

# Address requested by getHtml() on every pass of downImageNum().
# NOTE(review): this literal ends in .jpg, i.e. it names an image rather than
# an HTML page -- confirm it is the intended page to scan for image links.
url = u"http://www.51sjk.com/Upload/Articles/1/0/320/320464_20220812154329571.jpg"
# Output prefix: download() prepends this string verbatim to each generated
# file name, so it must end with a path separator.
outpath = "t:\\"

def getHtml(url):
    """Fetch *url* and return the response body.

    The body is also echoed to stdout (debug output).

    Args:
        url: Address passed to ``urllib.urlopen``.

    Returns:
        Whatever ``read()`` on the opened handle returns.
    """
    webfile = urllib.urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        # Release the connection even when read() raises.
        webfile.close()
    print(outhtml)
    return outhtml

# Image extensions, in the order their alternatives are tried by the regex.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')

# One alternative per scheme/extension pair, wrapped in a single capturing
# group so that findall() returns the complete URL. A URL body runs up to the
# first whitespace, comma or double quote.
_IMAGE_URL_RE = re.compile(
    '(' + '|'.join(
        r'%s://[^\s,"]*\.%s' % (scheme, ext)
        for scheme in ('http', 'https')
        for ext in _IMAGE_EXTENSIONS
    ) + ')'
)


def getImageList(html):
    """Return every http/https image URL found in *html*.

    A URL is recognised when it ends in .jpg, .jpeg, .png, .gif or .bmp.
    The resulting list is also echoed to stdout (debug output).

    Args:
        html: Text to scan.

    Returns:
        List of matched URLs in order of appearance (empty if none).
    """
    imgList = _IMAGE_URL_RE.findall(html)
    print(imgList)
    return imgList

def download(imgList, page):
    """Download every URL in *imgList* to a file under the global ``outpath``.

    Each file is named ``pic_<page>_<index><ext>``: ``page`` zero-padded to
    9 digits, ``index`` (1-based position in *imgList*) zero-padded to 10
    digits, and ``ext`` the lower-cased extension of the URL's last path
    segment.

    Args:
        imgList: Iterable of image URLs.
        page: Integer page number used in the generated file names.
    """
    for x, imgurl in enumerate(imgList, 1):
        # Percent-decode first so the extension comes from the real file name.
        filename = urllib2.unquote(imgurl).decode('utf8').split('/')[-1]
        # Lower-case only the extension; the outpath prefix is left untouched
        # so a mixed-case directory still resolves on case-sensitive systems.
        extension = str(os.path.splitext(filename)[1]).lower()
        filepathname = str(outpath + 'pic_%09d_%010d' % (page, x) + extension)
        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
        urllib.urlretrieve(imgurl, filepathname)

def downImageNum(pagenum):
    """Run the fetch / extract / download pipeline once per page number.

    Page numbers count from 1 up to and including *pagenum*; each one is
    passed to download(), which uses it in the generated file names.

    NOTE(review): the same global ``url`` is fetched on every pass -- the
    page number does not influence the request. Confirm that is intended.
    """
    current = 1
    while current <= pagenum:
        # Fetch the content behind url, collect every image address in it,
        # then download all of those images.
        download(getImageList(getHtml(url)), current)
        current += 1

# Script entry point: perform a single pass when executed directly.
if __name__ == '__main__':
    downImageNum(1)


软件
前端设计
程序设计
Java相关