Jsoup初接觸-發一個Jsoup抓取圖片的程序

 

主要有兩個線程:圖片url抓取線程、圖片下載保存線程。

 

圖片下載保存採用線程池處理,主要利用 Java 的 ThreadPoolExecutor 實現。

 

url抓取線程:

 

 

 

package sys.gifspider;

 

import java.io.IOException;

import java.util.Properties;

import java.util.concurrent.BlockingQueue;

import java.util.concurrent.LinkedBlockingQueue;

import java.util.concurrent.ThreadPoolExecutor;

import java.util.concurrent.TimeUnit;

 

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

 

import sys.gifspider.utils.PropertyUtil;

 

public class GifSpider implements Runnable

{

 

  volatile boolean isRunning = true;

  private ThreadPoolExecutor threadPool;

  BlockingQueue queue;

  

  public GifSpider(BlockingQueue queue)

  {

    this.queue = queue;

    this.init();

  }

  

  /**

   * 線程池初始化

   */

  private void init()

  {

    Properties pro = PropertyUtil.getProperties();

    int corePoolSize = Integer.parseInt(pro.getProperty(threadpool.corePoolSize));

    int maxPoolSize = Integer.parseInt(pro.getProperty(threadpool.maxPoolSize));

    int keepAliveSeconds = Integer.parseInt(pro.getProperty(threadpool.keepAliveSeconds));

    int queueCap = Integer.parseInt(pro.getProperty(threadpool.queueCapacity));

    BlockingQueue queue = new LinkedBlockingQueue(queueCap);

    this.threadPool = new ThreadPoolExecutor(

        corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS, 

        queue);

  }

  public boolean isRunning()

  {

    return isRunning;

  }

 

  public void setRunning(boolean isRunning)

  {

    this.isRunning = isRunning;

  }

 

  @Override

  public void run()

  {

    while (this.isRunning)

    {

      try

      {

        

        String url = this.queue.take();

        System.out.println(請求url: + url);

        Document doc = Jsoup.connect(url).get();

        //獲取所有

        Elements s = doc.select(p.pic_list2).first().select(a[href]);

        for (Element e : s)

        {

          //有img 和  文字 兩種href,指向相同德圖片,隻過濾圖片href就行瞭

          Elements s1 = e.select(img);

          if (s1.size() != 0)

          {

            String imgUrl = e.absUrl(href);

            String text = s1.attr(alt);

            Document doc1 = Jsoup.connect(imgUrl).get();

            Elements e1 = doc1.getElementById(endtext).select(img);

            //網頁源碼中是相對路徑,要獲取絕對路徑

            String realUrl = e1.attr(abs:src);

            System.out.println(獲取圖片url: + realUrl);

            //獲取到圖片url,扔給線程池處理

            GifProcessor pro = new GifProcessor(text,realUrl);

            this.threadPool.execute(pro);

          }

          

        }

        Thread.sleep(1000);

      } catch (InterruptedException e)

      {

        e.printStackTrace();

      } catch (IOException e)

      {

        e.printStackTrace();

      }

    }

    

  }

  

}

 

圖片處理線程很簡單,就是圖片下載和保存:

package sys.gifspider;

 

import sys.gifspider.utils.FileProcessor;

 

public class GifProcessor implements Runnable

{

 

  private String imgName;

  private String imgUrl;

  

  public GifProcessor(String name,String url)

  {

    this.imgName = name;

    this.imgUrl = url;

  }

  @Override

  public void run()

  {

    FileProcessor fp = new FileProcessor(this.imgName,this.imgUrl);

    try

    {

      System.out.println(下載保存圖片url:+this.imgUrl);

      fp.saveGif();

      

    }catch(Exception e)

    {

      System.out.println(下載保存圖片失敗,url:+this.imgUrl);

      e.printStackTrace();

    }

    

  }

  

}

 

 

 

 

下載保存:

 

 

package sys.gifspider.utils;

 

import java.io.BufferedOutputStream;

import java.io.ByteArrayOutputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.URL;

 

public class FileProcessor

{

  private String imgName;

  private String imgUrl;

  

  public FileProcessor(String name,String url)

  {

    this.imgName = name;

    this.imgUrl = url;

  }

  

  /**

   * 保存路徑,不存在就創建

   * @return

   */

  private String makeDir()

  {

    String strdir = PropertyUtil.getProperties().getProperty(dir);

    File dir = new File(strdir);

    if (!dir.exists())

    {

      dir.mkdir();

    }

    return strdir;

  }

  

  /**

   * 保存

   * @throws Exception

   */

  public void saveGif() throws Exception

  {

    String dir = makeDir();

    String file = dir + this.imgName + this.imgUrl.substring(this.imgUrl.lastIndexOf(.));

    BufferedOutputStream out = null;

    byte[] bit = this.download();

    if (bit.length > 0)

    {

      try

      {

        out = new BufferedOutputStream(new FileOutputStream(file));

        out.write(bit);

        out.flush();

      } finally

      {

        if (out != null)

          out.close();

      }

    }

  }

  /**

   * 下載

   * @return

   * @throws Exception

   */

  private byte[] download() throws Exception  

  {

    URL url = new URL(this.imgUrl);  

    HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();  

    httpConn.connect();  

    InputStream cin = httpConn.getInputStream();  

    ByteArrayOutputStream outStream = new ByteArrayOutputStream();  

    byte[] buffer = new byte[1024];  

    int len = 0;  

    while ((len = cin.read(buffer)) != -1) {  

        outStream.write(buffer, 0, len);  

    }  

    cin.close();  

    byte[] fileData = outStream.toByteArray();  

    outStream.close();  

    return fileData;  

  }

}

 

程序入口如下:

 

 

 

 

package sys.gifspider;

 

import java.io.IOException;

import java.util.Properties;

import java.util.concurrent.BlockingQueue;

import java.util.concurrent.LinkedBlockingQueue;

 

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

 

import sys.gifspider.utils.PropertyUtil;

 

public class Main

{

  public static void main(String[] args)

  {

    init();

    

  }

  public static void init()

  {

    Properties pro = PropertyUtil.getProperties();

    int startPage = Integer.parseInt(pro.getProperty(startPage));

    int endPage = Integer.parseInt(pro.getProperty(endPage));

    String url = pro.getProperty(url);

    int count = endPage – startPage +1;

    BlockingQueue queue = new LinkedBlockingQueue(count);

    for (int i = 1; i <= count; i++)

    {

      queue.add(String.format(url, i));

    }

    int spiderCount = Integer.parseInt(pro.getProperty(spiderThread));

    for (int i = 0; i < spiderCount; i++)

    {

      GifSpider spider = new GifSpider(queue);

      Thread t = new Thread(spider);

      t.start();

    }

  }

  

}

 

配置文件:

 

 

 

 

# Number of GifSpider threads started by Main.init().
spiderThread=1

 

# Settings passed to the ThreadPoolExecutor built in GifSpider.init().
threadpool.corePoolSize=8

threadpool.maxPoolSize=10

# Keep-alive time, in seconds (TimeUnit.SECONDS).
threadpool.keepAliveSeconds=600

# Capacity of the LinkedBlockingQueue that holds pending download tasks.
threadpool.queueCapacity=1000

 

# Main.init() derives the number of list pages as endPage - startPage + 1.
startPage=1

endPage=20

# Format string for list-page URLs; %d is filled in via String.format with the page number.
url=https://www.haha365.com/gxtp/index_gif_%d.htm

 

# Directory images are saved into; FileProcessor.makeDir() creates it if missing.
# Keep the trailing separator.
dir=E:/spider/

 

 

 

 

用 haha365 的動態 gif 做了下測試,如果想爬別的網站,自己根據人家的 html 結構,改一下爬取規則即可。

 

程序中沒做過多的容錯處理,可能存在一定的bug。

 

 

 

源碼下載

 

發佈留言