大战熟女丰满人妻av-荡女精品导航-岛国aaaa级午夜福利片-岛国av动作片在线观看-岛国av无码免费无禁网站-岛国大片激情做爰视频

專注Java教育14年 全國咨詢/投訴熱線:400-8080-105
動力節點LOGO圖
始于2009,口口相傳的Java黃埔軍校
首頁 hot資訊 封裝Java爬蟲工具類

封裝Java爬蟲工具類

更新時間:2021-09-22 11:12:00 來源:動力節點 瀏覽1027次

封裝了一個JAVA爬蟲工具類。

1.maven引用jar

   <dependency>
		<groupId>net.sourceforge.htmlunit</groupId>
		<artifactId>htmlunit</artifactId>
		<version>2.27</version>
	</dependency>
	<dependency>
		<groupId>org.jsoup</groupId>
		<artifactId>jsoup</artifactId>
		<version>1.8.3</version>
	</dependency>    

2.工具類

  public class HttpHtmlUnit {
	/**
	 * Request timeout in milliseconds; defaults to 20000 ms.
	 * Applied as both the socket timeout and the connect timeout of each request.
	 */
	private int timeout = 20000;
	/**
	 * Time to wait for asynchronous JS to execute, in milliseconds; defaults to 20000 ms.
	 */
	private int waitForBackgroundJavaScript = 20000;
	/**
	 * Cookie table (cookie name to cookie value), sent with every request and
	 * updated from every response.
	 */
	private Map<String, String> cookieMap = new HashMap<>();
/**
 * Charset used to decode response bodies; defaults to UTF-8.
 */
private String charset = "UTF-8";
// Shared instance handed out by getInstance(); stays null until first requested.
private static HttpHtmlUnit httpUtils;
// Private constructor: instances are obtained through getInstance().
private HttpHtmlUnit() {
}
/**
 * Returns the shared instance, creating it on first use.
 * No synchronization is performed here.
 *
 * @return the instance stored in {@code httpUtils}
 */
public static HttpHtmlUnit getInstance() {
	HttpHtmlUnit instance = httpUtils;
	if (instance == null) {
		instance = new HttpHtmlUnit();
		httpUtils = instance;
	}
	return instance;
}
/**
 * Removes every entry from the cookie table, so subsequent requests are sent
 * with an empty Cookie header value.
 */
public void invalidCookieMap() {
	this.cookieMap.clear();
}
/**
 * Returns the configured request timeout.
 *
 * @return the current value of {@code timeout}
 */
public int getTimeout() {
	return this.timeout;
}
/**
 * Sets the request timeout.
 *
 * @param timeout new timeout value, stored as-is without validation
 */
public void setTimeout(int timeout) {
	this.timeout = timeout;
}
/**
 * Returns the charset name used when decoding response bodies.
 *
 * @return the current value of {@code charset}
 */
public String getCharset() {
	return this.charset;
}
/**
 * Sets the charset name used when decoding response bodies.
 *
 * @param charset charset name, stored as-is without validation
 */
public void setCharset(String charset) {
	this.charset = charset;
}
/**
 * Returns how long page loading waits for background JavaScript.
 *
 * @return the current value of {@code waitForBackgroundJavaScript}
 */
public int getWaitForBackgroundJavaScript() {
	return this.waitForBackgroundJavaScript;
}
/**
 * Sets how long to wait for asynchronous JS to run when fetching a fully
 * rendered HTML page.
 *
 * @param waitForBackgroundJavaScript new wait time, stored as-is without validation
 */
public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
	this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
}
/**
 * Parses an HTML string into a document, with {@code &nbsp;} entities removed
 * (delegates to {@code removeHtmlSpace}).
 *
 * @param html HTML text to parse
 * @return the parsed document
 * @throws Exception declared for callers; nothing is thrown explicitly here
 */
public static Document parseHtmlToDoc(String html) throws Exception {
	Document parsed = removeHtmlSpace(html);
	return parsed;
}
/**
 * Parses the input, serializes it back to HTML, strips every literal
 * {@code &nbsp;} from that text and parses the result again.
 *
 * @param str HTML text to clean
 * @return the document parsed from the cleaned HTML
 */
private static Document removeHtmlSpace(String str) {
	String serialized = Jsoup.parse(str).html();
	String withoutNbsp = serialized.replace("&nbsp;", "");
	return Jsoup.parse(withoutNbsp);
}
/**
 * Performs a GET request and returns the response parsed as a document.
 *
 * @param url address to request
 * @return the response body passed through {@code parseHtmlToDoc}
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetAsDocument(String url) throws Exception {
	String html = executeGet(url);
	return parseHtmlToDoc(html);
}
/**
 * Performs a GET request and returns the response body as text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url address to request
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if executing the request, reading the body or closing the
 *                   client/response fails
 */
public String executeGet(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	// try-with-resources closes the response and then the client on every path,
	// including failures that happen before the body is read.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		// The status code is intentionally not inspected: the body is returned
		// for every status, which matches the previous effective behavior.
		HttpEntity entity = response.getEntity();
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Performs a GET request through the certificate-trusting SSL client and
 * returns the response parsed as a document.
 *
 * @param url address to request
 * @return the response body passed through {@code parseHtmlToDoc}
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetWithSSLAsDocument(String url) throws Exception {
	String html = executeGetWithSSL(url);
	return parseHtmlToDoc(html);
}
/**
 * Performs a GET request using the client built by
 * {@code createSSLInsecureClient()} and returns the response body as text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url address to request
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if building the SSL client, executing the request, reading
 *                   the body or closing the client/response fails
 */
public String executeGetWithSSL(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	// try-with-resources closes the response and then the client on every path,
	// including failures that happen before the body is read.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		// The status code is intentionally not inspected: the body is returned
		// for every status, which matches the previous effective behavior.
		HttpEntity entity = response.getEntity();
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Performs a form POST request and returns the response parsed as a document.
 *
 * @param url    address to request
 * @param params form fields to send
 * @return the response body passed through {@code parseHtmlToDoc}
 * @throws Exception if the request or the parsing fails
 */
public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
	String html = executePost(url, params);
	return parseHtmlToDoc(html);
}
/**
 * Performs a POST request with URL-encoded form parameters and returns the
 * response body as text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url    address to request
 * @param params form fields to send, one name/value pair per map entry
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if executing the request, reading the body or closing the
 *                   client/response fails
 */
public String executePost(String url, Map<String, String> params) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	List<NameValuePair> paramsRe = new ArrayList<>();
	for (Map.Entry<String, String> entry : params.entrySet()) {
		paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
	}
	httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
	HttpClientContext context = HttpClientContext.create();
	// Closing the response and the client releases the connection; previously
	// neither was closed, so every call leaked a client.
	try (CloseableHttpClient httpclient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpclient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// Guard against a missing entity instead of letting EntityUtils reject null.
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Performs a form POST request through the certificate-trusting SSL client and
 * returns the response parsed as a document.
 *
 * @param url    address to request
 * @param params form fields to send
 * @return the response body passed through {@code parseHtmlToDoc}
 * @throws Exception if the request or the parsing fails
 */
public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
	String html = executePostWithSSL(url, params);
	return parseHtmlToDoc(html);
}
/**
 * Performs a POST request with URL-encoded form parameters using the client
 * built by {@code createSSLInsecureClient()} and returns the response body as
 * text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url    address to request
 * @param params form fields to send, one name/value pair per map entry
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if building the SSL client, executing the request, reading
 *                   the body or closing the client/response fails
 */
public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
	HttpPost post = new HttpPost(url);
	List<NameValuePair> paramsRe = new ArrayList<>();
	for (Map.Entry<String, String> entry : params.entrySet()) {
		paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
	}
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new UrlEncodedFormEntity(paramsRe));
	HttpClientContext contextRe = HttpClientContext.create();
	// Previously neither the client nor the response was ever closed.
	try (CloseableHttpClient httpClientRe = createSSLInsecureClient();
			CloseableHttpResponse response = httpClientRe.execute(post, contextRe)) {
		HttpEntity entity = response.getEntity();
		String re = entity != null ? EntityUtils.toString(entity, charset) : "";
		getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
		return re;
	}
}
/**
 * Sends a POST request whose body is JSON and returns the response body as
 * text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url      address to request
 * @param jsonBody JSON text sent as the request body with content type
 *                 {@code ContentType.APPLICATION_JSON}
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if executing the request, reading the body or closing the
 *                   client/response fails
 */
public String executePostWithJson(String url, String jsonBody) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext context = HttpClientContext.create();
	// Closing the response and the client releases the connection; previously
	// neither was closed, so every call leaked a client.
	try (CloseableHttpClient httpclient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpclient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// Guard against a missing entity instead of letting EntityUtils reject null.
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Sends a POST request whose body is JSON using the client built by
 * {@code createSSLInsecureClient()} and returns the response body as text.
 *
 * The current cookie table is sent in the Cookie header, and cookies found in
 * the request context afterwards are merged back into the table. The body is
 * decoded with {@code charset}.
 *
 * @param url      address to request
 * @param jsonBody JSON text sent as the request body with content type
 *                 {@code ContentType.APPLICATION_JSON}
 * @return the response body, or an empty string when the response has no entity
 * @throws Exception if building the SSL client, executing the request, reading
 *                   the body or closing the client/response fails
 */
public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
	HttpPost post = new HttpPost(url);
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext contextRe = HttpClientContext.create();
	// Previously neither the client nor the response was ever closed.
	try (CloseableHttpClient httpClientRe = createSSLInsecureClient();
			CloseableHttpResponse response = httpClientRe.execute(post, contextRe)) {
		HttpEntity entity = response.getEntity();
		String re = entity != null ? EntityUtils.toString(entity, charset) : "";
		getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
		return re;
	}
}
/**
 * Loads a page with HtmlUnit, waits for background JavaScript and returns the
 * page serialized as XML text.
 *
 * @param url page URL
 * @return the result of {@code HtmlPage.asXml()} for the loaded page
 * @throws Exception if loading the page, waiting for JavaScript or
 *                   serializing the page fails
 */
public String getHtmlPageResponse(String url) throws Exception {
	final WebClient webClient = new WebClient(BrowserVersion.CHROME);
	// try/finally guarantees the client is closed even when waiting for
	// JavaScript or serializing the page throws, not only when getPage fails.
	try {
		webClient.getOptions().setThrowExceptionOnScriptError(false);// do not throw when a script fails
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);// do not throw on non-200 HTTP status
		webClient.getOptions().setActiveXNative(true);
		webClient.getOptions().setCssEnabled(true);// enable CSS
		webClient.getOptions().setJavaScriptEnabled(true); // important: enable JS
		webClient.setAjaxController(new NicelyResynchronizingAjaxController());// important: enable AJAX support
		webClient.getOptions().setTimeout(timeout);// request timeout of the simulated browser
		webClient.setJavaScriptTimeout(timeout);// timeout for JS execution
		HtmlPage page = webClient.getPage(url);
		webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);// blocks the calling thread
		return page.asXml();
	} finally {
		webClient.close();
	}
}
/**
 * Loads a page with HtmlUnit (waiting for asynchronous JS) and returns it
 * parsed as a document.
 *
 * @param url page URL
 * @return the page text passed through {@code parseHtmlToDoc}
 * @throws Exception if loading or parsing the page fails
 */
public Document getHtmlPageResponseAsDocument(String url) throws Exception {
	String pageXml = getHtmlPageResponse(url);
	return parseHtmlToDoc(pageXml);
}
/**
 * Copies every cookie of the store into the given map, keyed by cookie name.
 * An existing entry with the same name is overwritten.
 *
 * @param cookieStore store to read cookies from
 * @param cookieMap   map that receives the name/value pairs
 */
private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
	cookieStore.getCookies().forEach(c -> cookieMap.put(c.getName(), c.getValue()));
}
/**
 * Renders the map as a Cookie header value: {@code name=value} pairs joined
 * by {@code "; "}, in the map's iteration order.
 *
 * @param map cookie names mapped to cookie values
 * @return the joined pairs, or an empty string for an empty map
 */
private String convertCookieMapToString(Map<String, String> map) {
	StringBuilder header = new StringBuilder();
	for (Map.Entry<String, String> pair : map.entrySet()) {
		// Emit the separator before every pair except the first one.
		if (header.length() > 0) {
			header.append("; ");
		}
		header.append(pair.getKey()).append("=").append(pair.getValue());
	}
	return header.toString();
}
/**
 * Builds an HTTP client for SSL connections that performs no verification:
 * the trust strategy accepts every certificate chain and the hostname
 * verifier accepts every host.
 *
 * @return a new client using the permissive SSL socket factory
 * @throws GeneralSecurityException if the SSL context cannot be built
 */
private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
	SSLContextBuilder contextBuilder = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true);
	SSLContext permissiveContext = contextBuilder.build();
	SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(permissiveContext,
			(hostname, session) -> true);
	return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
}

3.遇到的問題:

htmlunit引用的commons-io版本較低。如果項目中其它地方引用了較高版本的commons-io,版本沖突會導致問題。處理版本沖突可參照maven的依賴調解原則:依賴路徑最短者優先,路徑長度相同時,在pom文件中聲明位置靠前的版本優先被采用。因此可以把所需版本的依賴聲明放在較前位置,或在pom中直接顯式聲明所需的commons-io版本。

Java開發工具有很多,大家以后可以慢慢了解,有些工具是比較常用的,大家可要掌握哦。

提交申請后,顧問老師會電話與您溝通安排學習

免費課程推薦 >>
技術文檔推薦 >>
主站蜘蛛池模板: 日本欧美一区二区三区在线观看 | 我想看一级黄色毛片 | 四虎精品影院永久在线播放 | 精品免费tv久久久久久久 | 日韩免费在线视频观看 | 不卡神马影院 | 美女一级a毛片免费观看 | 色青青草原桃花久久综合 | 美女一级毛片免费观看 | 99视频精品全部在线播放 | 亚洲精品国产国语 | 看免费的毛片 | 一级毛片中文字幕 | 国产精品夜色视频一级区 | 久久精品国产2020观看福利色 | 天天干天天在线 | 亚洲国产欧美日韩一区二区三区 | 欧美www| 国产精品麻豆a啊在线观看 国产精品麻豆高清在线观看 | 亚洲欧美激情精品一区二区 | 久久不射网 | 四虎免费看片 | 一级a欧美毛片 | 成人网18免费网站 | 天堂毛片| 久久免费香蕉视频 | 国产午夜成人无码免费看 | 久久综合九色综合欧美狠狠 | 99精品国产一区二区青青牛奶 | 亚洲免费视频网址 | 91尤物视频 | 久久久高清国产999尤物 | 成人国产一区二区 | 91在线九色| 色噜噜狠狠色综合免费视频 | 色婷婷免费视频 | 亚洲精品久久久久中文字幕一区 | 精品一区二区三区色花堂 | 欧美精品专区第1页 | 99九九精品免费视频观看 | 香蕉精品视频在线观看入口 |