2018-12-14 13:03:21 卢浮宫 版权声明:本文为站长原创文章,转载请写明出处
一、之前写了一个使用java爬取网页图片路径的文章,今天这个算是一个补充完善
二、步骤如下:①爬取网页元素②过滤图片元素③提取出图片在线地址④读取图片资源并保存到本地
三、核心代码如下:
①处理http请求类(这里只用到了第一个get请求)
②资源处理核心代码
package com.lfg.http;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import net.sf.json.JSONObject;
public class HttpUtil {
/**
 * Performs a GET-style request on {@code url} and returns the response body as text.
 *
 * <p>Failures are reported with a stack trace and never propagate: if the URL is
 * malformed or the connection cannot be opened, an empty string is returned; if the
 * stream fails part-way through, whatever was read up to that point is returned.
 *
 * @param url the address to fetch
 * @return the response body decoded as UTF-8, possibly empty or partial on failure
 */
public static String getHttpResp(String url){
    StringBuilder body = new StringBuilder();
    Reader rd = null;
    try {
        URLConnection uc = new URL(url).openConnection();
        uc.setConnectTimeout(10000);
        // NOTE(review): doOutput is kept from the original even though no request
        // body is written here - confirm whether it is actually needed.
        uc.setDoOutput(true);
        // Decode with an explicit charset so the result does not depend on the
        // platform default encoding of the machine running the crawler.
        rd = new InputStreamReader(new BufferedInputStream(uc.getInputStream()), "UTF-8");
        char[] buf = new char[4096];
        int n;
        while ((n = rd.read(buf)) != -1) {
            body.append(buf, 0, n);
        }
    } catch (IOException e) {
        // MalformedURLException is an IOException, so bad URLs land here as well.
        e.printStackTrace();
    } finally {
        if (rd != null) {
            try {
                rd.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return body.toString();
}
/**
 * Opens a connection to {@code url} and returns its response body as a buffered stream.
 *
 * <p>The stream is handed back open; the caller is responsible for closing it.
 *
 * @param url the address to fetch
 * @return a buffered stream over the response body, or {@code null} if the URL is
 *         malformed, the connection cannot be opened, or the response cannot be read
 */
public static InputStream getHttpRespAndOutStream(String url){
    try {
        URLConnection uc = new URL(url).openConnection();
        uc.setConnectTimeout(10000);
        // NOTE(review): doOutput is kept from the original even though no request
        // body is written here - confirm whether it is actually needed.
        uc.setDoOutput(true);
        return new BufferedInputStream(uc.getInputStream());
    } catch (IOException e) {
        // MalformedURLException is an IOException, so bad URLs land here as well.
        e.printStackTrace();
        return null;
    }
}
/**
 * Sends {@code params} as the body of a request to {@code url} and returns the response text.
 *
 * <p>The request is labelled {@code application/json;charset=utf-8}, so the body is
 * written as UTF-8 to match; the response is decoded as UTF-8 as well. Response lines
 * are concatenated without their line terminators. Any failure is printed and the text
 * collected so far (possibly empty) is returned.
 *
 * @param url    the address to send the request to
 * @param params the request body, written as-is
 * @return the concatenated response lines, or an empty string if the request fails
 */
public String postHttpResp(String url,String params){
    PrintWriter out = null;
    BufferedReader in = null;
    StringBuilder result = new StringBuilder();
    try {
        URLConnection conn = new URL(url).openConnection();
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("Content-Type", "application/json;charset=utf-8");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        conn.setDoOutput(true);
        conn.setDoInput(true);
        // Encode explicitly: the platform default charset may not be the UTF-8
        // that the Content-Type header promises.
        out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(), "UTF-8"));
        out.print(params);
        out.flush();
        in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送 POST 请求出现异常!"+e);
        e.printStackTrace();
    } finally {
        if (out != null) {
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
    return result.toString();
}
/**
 * Sends the text form of {@code params} as the UTF-8 body of a request to {@code url}
 * and returns the response text.
 *
 * <p>The parameters are echoed to standard output before sending. Response lines are
 * decoded as UTF-8 and concatenated without their line terminators. Any failure is
 * printed and the text collected so far (possibly empty) is returned.
 *
 * @param url    the address to send the request to
 * @param params the JSON object whose string form becomes the request body
 * @return the concatenated response lines, or an empty string if the request fails
 */
public String postHttpRespForJson(String url,JSONObject params){
    System.out.println("请求参数"+ params);
    PrintWriter out = null;
    BufferedReader in = null;
    StringBuilder result = new StringBuilder();
    try {
        URLConnection conn = new URL(url).openConnection();
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        conn.setRequestProperty("Accept-Charset", "UTF-8");
        // The original sent a header literally named "contentType" with value "UTF-8",
        // which is not the Content-Type header; declare the JSON body the same way
        // postHttpResp does.
        conn.setRequestProperty("Content-Type", "application/json;charset=utf-8");
        conn.setDoOutput(true);
        conn.setDoInput(true);
        out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(),"utf-8"));
        out.print(params);
        out.flush();
        in = new BufferedReader(new InputStreamReader(
                conn.getInputStream(),"UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送 POST 请求出现异常!"+e);
        e.printStackTrace();
    } finally {
        if (out != null) {
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
    return result.toString();
}
public String getAccessToken(String temp){
String tt = temp.toString().replace("{", " ").replace("}", " "),access_token = "";
String[] arr = tt.split(",");
for (String item : arr) {
if (item.substring(0,10).equals(""access_to")) {
String str = item.toString().split(":")[1];
access_token = str.substring(1, str.length()-1);
}
}
return access_token;
}
}import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import net.sf.json.JSONObject;
public class HttpUtil {
/**
 * Performs a GET-style request on {@code url} and returns the response body as text.
 *
 * <p>Failures are reported with a stack trace and never propagate: if the URL is
 * malformed or the connection cannot be opened, an empty string is returned; if the
 * stream fails part-way through, whatever was read up to that point is returned.
 *
 * @param url the address to fetch
 * @return the response body decoded as UTF-8, possibly empty or partial on failure
 */
public static String getHttpResp(String url){
    StringBuilder body = new StringBuilder();
    Reader rd = null;
    try {
        URLConnection uc = new URL(url).openConnection();
        uc.setConnectTimeout(10000);
        // NOTE(review): doOutput is kept from the original even though no request
        // body is written here - confirm whether it is actually needed.
        uc.setDoOutput(true);
        // Decode with an explicit charset so the result does not depend on the
        // platform default encoding of the machine running the crawler.
        rd = new InputStreamReader(new BufferedInputStream(uc.getInputStream()), "UTF-8");
        char[] buf = new char[4096];
        int n;
        while ((n = rd.read(buf)) != -1) {
            body.append(buf, 0, n);
        }
    } catch (IOException e) {
        // MalformedURLException is an IOException, so bad URLs land here as well.
        e.printStackTrace();
    } finally {
        if (rd != null) {
            try {
                rd.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return body.toString();
}
/**
 * Opens a connection to {@code url} and returns its response body as a buffered stream.
 *
 * <p>The stream is handed back open; the caller is responsible for closing it.
 *
 * @param url the address to fetch
 * @return a buffered stream over the response body, or {@code null} if the URL is
 *         malformed, the connection cannot be opened, or the response cannot be read
 */
public static InputStream getHttpRespAndOutStream(String url){
    try {
        URLConnection uc = new URL(url).openConnection();
        uc.setConnectTimeout(10000);
        // NOTE(review): doOutput is kept from the original even though no request
        // body is written here - confirm whether it is actually needed.
        uc.setDoOutput(true);
        return new BufferedInputStream(uc.getInputStream());
    } catch (IOException e) {
        // MalformedURLException is an IOException, so bad URLs land here as well.
        e.printStackTrace();
        return null;
    }
}
public String postHttpResp(String url,String params){
PrintWriter out = null;
BufferedReader in = null;
String result = "";
try {
URL realUrl = new URL(url);
URLConnection conn = realUrl.openConnection();
conn.setRequestProperty("accept", "*
/*");
conn.setRequestProperty("Content-Type", "application/json;charset=utf-8");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
conn.setDoOutput(true);
conn.setDoInput(true);
out = new PrintWriter(conn.getOutputStream());
out.print(params);
out.flush();
in = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result += line;
}
} catch (Exception e) {
System.out.println("发送 POST 请求出现异常!"+e);
e.printStackTrace();
}
finally{
try{
if(out!=null){
out.close();
}
if(in!=null){
in.close();
}
}
catch(IOException ex){
ex.printStackTrace();
}
}
return result;
}
public String postHttpRespForJson(String url,JSONObject params){
System.out.println("请求参数"+ params);
PrintWriter out = null;
BufferedReader in = null;
String result = "";
try {
URL realUrl = new URL(url);
URLConnection conn = realUrl.openConnection();
conn.setRequestProperty("accept", "*/
*");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible;
MSIE 6.0;
Windows NT 5.1;
SV1)");
conn.setRequestProperty("Accept-Charset", "UTF-8");
conn.setRequestProperty("contentType", "UTF-8");
conn.setDoOutput(true);
conn.setDoInput(true);
out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(),"utf-8"));
out.print(params);
out.flush();
in = new BufferedReader(new InputStreamReader(
conn.getInputStream(),"UTF-8"));
String line;
while ((line = in.readLine()) != null) {
result += line;
}
} catch (Exception e) {
System.out.println("发送 POST 请求出现异常!"+e);
e.printStackTrace();
}
finally{
try{
if(out!=null){
out.close();
}
if(in!=null){
in.close();
}
}
catch(IOException ex){
ex.printStackTrace();
}
}
return result;
}
public String getAccessToken(String temp){
String tt = temp.toString().replace("{", " ").replace("}", " "),access_token = "";
String[] arr = tt.split(",");
for (String item : arr) {
if (item.substring(0,10).equals(""access_to")) {
String str = item.toString().split(":")[1];
access_token = str.substring(1, str.length()-1);
}
}
return access_token;
}
}
package com.lfg.crawler;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.lfg.http.HttpUtil;
public class Crawler extends HttpUtil {
public static void main(String[] args) throws MalformedURLException, IOException {
List arrL = new ArrayList();
String url = "http://www.tooopen.com/img/88_879.aspx";
String strHtml = getHttpResp(url);
// System.out.println(strHtml);
Pattern p = Pattern.compile("<img\b[^>]*\bsrc\b\s*=\s*("|")?([^"" f>]+(\.jpg|\.bmp|\.eps|\.gif|\.mif|\.miff|\.png|\.tif|\.tiff|\.svg|\.wmf|\.jpe|\.jpeg|\.dib|\.ico|\.tga|\.cut|\.pic)\b)[^>]*>", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(strHtml );
String quote,src = "";
while (m.find()) {
quote = m.group(1);
src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("\s+")[0] : m.group(2);
if(src.subSequence(0, 4).equals("http")){
arrL.add(src);
}
}
for (Object sstr : arrL) {
InputStream inStream = getHttpRespAndOutStream(sstr.toString());
try {
byte[] data = readInputStream(inStream);
File imageFile = new File("D:\crawlerImg\Img" + arrL.indexOf(sstr) + ".jpg");
//创建输出流
FileOutputStream outStream = new FileOutputStream(imageFile);
//写入数据
outStream.write(data);
//关闭输出流
outStream.close();
} catch (Exception e) {
e.printStackTrace();
}
}
System.out.println("工作已完成...");
}
/**
 * Drains {@code inStream} into memory and returns everything that was read.
 *
 * <p>Reading continues until the stream reports end-of-stream. The stream is not
 * closed here.
 *
 * @param inStream the stream to read to its end
 * @return all bytes read from the stream, in order
 * @throws Exception if reading from the stream fails
 */
public static byte[] readInputStream(InputStream inStream) throws Exception{
    final byte[] chunk = new byte[4096];
    final ByteArrayOutputStream collected = new ByteArrayOutputStream();
    for (int count = inStream.read(chunk); count != -1; count = inStream.read(chunk)) {
        collected.write(chunk, 0, count);
    }
    return collected.toByteArray();
}
}