提取网页图片并保存到本地

2018-12-14 13:03:21  卢浮宫  版权声明:本文为站长原创文章,转载请写明出处


一、之前写了一篇使用Java爬取网页图片路径的文章,今天这篇算是对它的补充完善

二、步骤如下:①爬取网页元素②过滤图片元素③提取出图片在线地址④读取图片资源并保存到本地

三、核心代码如下:

    ①处理http请求类(这里只用到了前两个get请求方法:getHttpResp 和 getHttpRespAndOutStream)

package com.lfg.http;

import java.io.BufferedInputStream;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.PrintWriter;

import java.io.Reader;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import net.sf.json.JSONObject;

public class HttpUtil {

    public static String getHttpResp(String url){

        URLConnection uc = null;

        try {

            uc = new URL(url).openConnection();

        }

        catch (MalformedURLException e) {

            e.printStackTrace();

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        uc.setConnectTimeout(10000);

        uc.setDoOutput(true);

        InputStream in = null;

        try {

            in = new BufferedInputStream(uc.getInputStream());

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        Reader rd = new InputStreamReader(in);

        int c = 0;

        StringBuffer temp = new StringBuffer();

        try {

            while ((c = rd.read()) != -1) {

                temp.append((char) c);

            }

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        try {

            in.close();

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        return temp.toString();

    }

    public static InputStream getHttpRespAndOutStream(String url){

        URLConnection uc = null;

        try {

            uc = new URL(url).openConnection();

        }

        catch (MalformedURLException e) {

            e.printStackTrace();

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        uc.setConnectTimeout(10000);

        uc.setDoOutput(true);

        InputStream in = null;

        try {

            in = new BufferedInputStream(uc.getInputStream());

        }

        catch (IOException e) {

            e.printStackTrace();

        }

        // try {

        // in.close();

        // } catch (IOException e) {

        // e.printStackTrace();

        // }

        return in;

    }

    public String postHttpResp(String url,String params){

        PrintWriter out = null;

        BufferedReader in = null;

        String result = "";

        try {

            URL realUrl = new URL(url);

            URLConnection conn = realUrl.openConnection();

            conn.setRequestProperty("accept", "*/*");

            conn.setRequestProperty("Content-Type", "application/json;charset=utf-8");

            conn.setRequestProperty("connection", "Keep-Alive");

            conn.setRequestProperty("user-agent",

                                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");

            conn.setDoOutput(true);

            conn.setDoInput(true);

            out = new PrintWriter(conn.getOutputStream());

            out.print(params);

            out.flush();

            in = new BufferedReader(

                                new InputStreamReader(conn.getInputStream()));

            String line;

            while ((line = in.readLine()) != null) {

                result += line;

            }

        }

        catch (Exception e) {

            System.out.println("发送 POST 请求出现异常!"+e);

            e.printStackTrace();

        }

        finally{

            try{

                if(out!=null){

                    out.close();

                }

                if(in!=null){

                    in.close();

                }

            }

            catch(IOException ex){

                ex.printStackTrace();

            }

        }

        return result;

    }

    public String postHttpRespForJson(String url,JSONObject params){

        System.out.println("请求参数"+ params);

        PrintWriter out = null;

        BufferedReader in = null;

        String result = "";

        try {

            URL realUrl = new URL(url);

            URLConnection conn = realUrl.openConnection();

            conn.setRequestProperty("accept", "*/*");

            conn.setRequestProperty("connection", "Keep-Alive");

            conn.setRequestProperty("user-agent",

                        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");

            conn.setRequestProperty("Accept-Charset", "UTF-8");

            conn.setRequestProperty("contentType", "UTF-8");

            conn.setDoOutput(true);

            conn.setDoInput(true);

            out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(),"utf-8"));

            out.print(params);

            out.flush();

            in = new BufferedReader(new InputStreamReader(

                                conn.getInputStream(),"UTF-8"));

            String line;

            while ((line = in.readLine()) != null) {

                result += line;

            }

        }

        catch (Exception e) {

            System.out.println("发送 POST 请求出现异常!"+e);

            e.printStackTrace();

        }

        finally{

            try{

                if(out!=null){

                    out.close();

                }

                if(in!=null){

                    in.close();

                }

            }

            catch(IOException ex){

                ex.printStackTrace();

            }

        }

        return result;

    }

    public String getAccessToken(String temp){

        String tt = temp.toString().replace("{", " ").replace("}", " "),access_token = "";

        String[] arr = tt.split(",");

        for (String item : arr) {

            if (item.substring(0,10).equals(""access_to")) {

String str = item.toString().split(":")[1];

access_token = str.substring(1, str.length()-1);

}

}

return access_token;

}

}import java.io.BufferedInputStream;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.PrintWriter;

import java.io.Reader;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import net.sf.json.JSONObject;

public class HttpUtil {

public static String getHttpResp(String url){

URLConnection uc = null;

try {

uc = new URL(url).openConnection();

} catch (MalformedURLException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

}

uc.setConnectTimeout(10000);

uc.setDoOutput(true);

InputStream in = null;

try {

in = new BufferedInputStream(uc.getInputStream());

} catch (IOException e) {

e.printStackTrace();

}

Reader rd = new InputStreamReader(in);

int c = 0;

StringBuffer temp = new StringBuffer(); 

try {

while ((c = rd.read()) != -1) {

    temp.append((char) c);

}

} catch (IOException e) {

e.printStackTrace();

}

try {

in.close();

} catch (IOException e) {

e.printStackTrace();

}

return temp.toString();

}

public static InputStream getHttpRespAndOutStream(String url){

URLConnection uc = null;

try {

uc = new URL(url).openConnection();

} catch (MalformedURLException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

}

uc.setConnectTimeout(10000);

uc.setDoOutput(true);

InputStream in = null;

try {

in = new BufferedInputStream(uc.getInputStream());

} catch (IOException e) {

e.printStackTrace();

}

// try {

// in.close();

// } catch (IOException e) {

// e.printStackTrace();

// }

return in;

}

public String postHttpResp(String url,String params){ 

PrintWriter out = null;

        BufferedReader in = null;

        String result = "";

        try {

            URL realUrl = new URL(url);

            URLConnection conn = realUrl.openConnection();            

            conn.setRequestProperty("accept", "*

            /*");

            conn.setRequestProperty("Content-Type", "application/json;charset=utf-8");

            conn.setRequestProperty("connection", "Keep-Alive");

            conn.setRequestProperty("user-agent",

                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");

            conn.setDoOutput(true);

            conn.setDoInput(true);

            out = new PrintWriter(conn.getOutputStream());

            out.print(params);

            out.flush();

            in = new BufferedReader(

                    new InputStreamReader(conn.getInputStream()));

            String line;

            while ((line = in.readLine()) != null) {

                result += line;

            }

        } catch (Exception e) {

            System.out.println("发送 POST 请求出现异常!"+e);

            e.printStackTrace();

        }

        finally{

            try{

                if(out!=null){

                    out.close();

                }

                if(in!=null){

                    in.close();

                }

            }

            catch(IOException ex){

                ex.printStackTrace();

            }

        }

        return result;

}

public String postHttpRespForJson(String url,JSONObject params){

System.out.println("请求参数"+ params);

PrintWriter out = null;

        BufferedReader in = null;

        String result = "";

        try {

            URL realUrl = new URL(url);

            URLConnection conn = realUrl.openConnection();

            conn.setRequestProperty("accept", "*/

            *");

            conn.setRequestProperty("connection", "Keep-Alive");

            conn.setRequestProperty("user-agent",

            "Mozilla/4.0 (compatible;

            MSIE 6.0;

            Windows NT 5.1;

            SV1)");

            conn.setRequestProperty("Accept-Charset", "UTF-8");

            conn.setRequestProperty("contentType", "UTF-8");

            conn.setDoOutput(true);

            conn.setDoInput(true);

            out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(),"utf-8")); 

            out.print(params);

            out.flush();

            in = new BufferedReader(new InputStreamReader(

                    conn.getInputStream(),"UTF-8"));

            String line;

            while ((line = in.readLine()) != null) {

                result += line;

            }

        } catch (Exception e) {

            System.out.println("发送 POST 请求出现异常!"+e);

            e.printStackTrace();

        }

        finally{

            try{

                if(out!=null){

                    out.close();

                }

                if(in!=null){

                    in.close();

                }

            }

            catch(IOException ex){

                ex.printStackTrace();

            }

        }

        return result;

}

public String getAccessToken(String temp){

String tt = temp.toString().replace("{", " ").replace("}", " "),access_token = "";

String[] arr = tt.split(",");

for (String item : arr) {

if (item.substring(0,10).equals(""access_to")) {

                String str = item.toString().split(":")[1];

                access_token = str.substring(1, str.length()-1);

            }

        }

        return access_token;

    }

}

②资源处理核心代码

package com.lfg.crawler;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.lfg.http.HttpUtil;

public class Crawler extends HttpUtil {
public static void main(String[] args) throws MalformedURLException, IOException {
List arrL = new ArrayList();
String url = "http://www.tooopen.com/img/88_879.aspx";
String strHtml =  getHttpResp(url);
// System.out.println(strHtml);
Pattern p = Pattern.compile("<img\b[^>]*\bsrc\b\s*=\s*("|")?([^"" f>]+(\.jpg|\.bmp|\.eps|\.gif|\.mif|\.miff|\.png|\.tif|\.tiff|\.svg|\.wmf|\.jpe|\.jpeg|\.dib|\.ico|\.tga|\.cut|\.pic)\b)[^>]*>", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(strHtml );
String quote,src = "";
while (m.find()) {
quote = m.group(1);   
src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("\s+")[0] : m.group(2);
if(src.subSequence(0, 4).equals("http")){
arrL.add(src);
}
}
for (Object sstr : arrL) {
InputStream inStream = getHttpRespAndOutStream(sstr.toString());
try {
byte[] data = readInputStream(inStream);
        File imageFile = new File("D:\crawlerImg\Img" + arrL.indexOf(sstr) + ".jpg");  
        //创建输出流  
        FileOutputStream outStream = new FileOutputStream(imageFile);  
        //写入数据  
        outStream.write(data);  
        //关闭输出流  
        outStream.close(); 
} catch (Exception e) {
e.printStackTrace();
}
}
System.out.println("工作已完成...");
}

public static byte[] readInputStream(InputStream inStream) throws Exception{  
ByteArrayOutputStream output = new ByteArrayOutputStream();
    byte[] buffer = new byte[4096];
    int n = 0;
    while (-1 != (n = inStream.read(buffer))) {
        output.write(buffer, 0, n);
    }
    return output.toByteArray();
    }  
}




更多精彩请关注guangmuhua.com


最新评论:

郭某到此一游
2019-01-16 10:24:47
1楼