import .....
/**
* 获取****的数据
*/
public class domain3 {
/**
 * Fetches the page at the given URL and returns its body as text.
 *
 * <p>The response is decoded as UTF-8 and read line by line; every line is
 * terminated with a single {@code '\n'} in the returned string. On success the
 * URL is echoed to standard output.
 *
 * <p>Failures are not propagated: any exception is reported on standard output,
 * its stack trace is printed, and an empty string is returned.
 *
 * @param url absolute HTTP URL of the page to download
 * @return the page content, or an empty string if the page could not be loaded
 */
public String gethtmlstring(String url){
    String hs = "";
    try {
        URL u = new URL(url);
        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
        conn.setRequestProperty("user-agent", "msie 7.0");
        // Without explicit timeouts a stalled server blocks getInputStream()
        // and readLine() indefinitely; fail fast so the caller can move on.
        conn.setConnectTimeout(15000);
        conn.setReadTimeout(30000);

        StringBuilder htmlString = new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                htmlString.append(line).append('\n');
            }
        } finally {
            // Always release the stream, even when reading fails part-way.
            br.close();
        }
        hs = htmlString.toString();
        System.out.println(url);
    } catch (Exception e) {
        System.out.println("url地址加载出错!!");
        e.printStackTrace();
    }
    return hs;
}
public static void main(string rags[]){
dao d = new dao();
domain3 dm = new domain3();
string title="";
string section="";
string content="";
string contenttitle="";
int count=110;
string url="http://*************************" ;
if(d.createtable()){
system.out.println("建表成功!!!");
try {
//加载标题页面
document doc = jsoup.parse(dm.gethtmlstring(url));
element titles = doc.getelementbyid("maincontent");
elements lis=titles.getelementsbytag("li");
//*********************标题****************************
for(int i=1;i
if(a.tostring().equals("")){
title=lis.get(i).text();
contenttitle=title;
string data[]={contenttitle,title,section,content,url};
if(d.pinsertdata(data)){
system.out.println("第"+(i+1)+"题数据插入成功!!!");
system.out.println("*****************"+count+"*****************");
}else{
system.out.println("第"+(i+1)+"题节数据插入失败!!!");
system.out.println("*****************"+count+"*****************");
break;
}
count++;
continue;
}else{
title=a.get(0).text();
url="http://****************"+a.get(0).attr("href");
//加载章节页面
document doc2=jsoup.parse(dm.gethtmlstring(url));
element sections =doc2.getelementbyid("maincontent");
elements ls = sections.getelementsbytag("li");
//**********************节************************
for(int j=0;j
if(link.tostring().equals("")){
section=ls.get(j).text();
contenttitle=title+" "+section;
}else{
section = link.get(0).text();
url="http:*******************"+link.get(0).attr("href");
//加载内容页面
document doc3=jsoup.parse(dm.gethtmlstring(url));
element contents=doc3.getelementbyid("maincontent");
content=contents.text();
//处理内容字符串
content=content.substring(content.indexof("?")+"?".length());
content=content.replace("'", "''");
contenttitle=title+" "+section;
}
system.out.println("****************"+count+"******************");
system.out.println("正在读第"+(i+1)+"题"+(j+1)+"节");
//往数据库插入数据
string data[]={contenttitle,title,section,content,url};
if(d.pinsertdata(data)){
system.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!");
system.out.println("*****************"+count+"*****************");
count++;
}else{
system.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!");
system.out.println("*****************"+count+"*****************");
break;
}
}//end for
}
system.out.println("第"+(i+1)+"题采集完毕");
}//end for
system.out.println("采集完毕!!");
} catch (exception e) {
// todo auto-generated catch block
e.printstacktrace();
}
?
开多个线程跑
主要是这两句,debug的时候老是在这两句停好长时间
1.BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8"))
2.while((line=br.readLine())!=null){
HtmlString.append(line+"\n");
}
用jsoup吧,很简单,也很好爬
一开始用的就是jsoup,效率比这个还低:程序在 `Document doc = Jsoup.parse(method.getResponseBodyAsString());` 这一步就走不动了,很头疼。有人建议我用SAX解析,但是SAX能用来解析HTML吗?
多线程+提高带宽
/**
* 获取**************的数据
* @author wf
*
*/
public class DoMain5 {
/**
 * Downloads and parses the page at the given URL with jsoup.
 *
 * <p>The request uses an explicit 30-second timeout and is retried when it
 * times out, up to three attempts in total. Any other failure is not retried.
 * When the page cannot be loaded, the failure is reported on standard output
 * with its stack trace and {@code null} is returned.
 *
 * @param url absolute URL of the page to fetch
 * @return the parsed document, or {@code null} if it could not be loaded;
 *         callers must check for {@code null} before using the result
 */
public Document getDoc(String url){
    final int timeoutMillis = 30000;
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return Jsoup.connect(url).timeout(timeoutMillis).get();
        } catch (java.net.SocketTimeoutException e) {
            // Timeouts are usually transient; only report once retries are exhausted.
            if (attempt == maxAttempts) {
                System.out.println("文档解析失败!!");
                e.printStackTrace();
            }
        } catch (Exception e) {
            // Non-timeout failures (bad URL, HTTP error status, ...) will not
            // improve on retry, so give up immediately.
            System.out.println("文档解析失败!!");
            e.printStackTrace();
            return null;
        }
    }
    return null;
}
public static void main(String rags[]){
Dao d = new Dao();
DoMain5 dm = new DoMain5();
String title="";
String section="";
String content="";
String contentTitle="";
int count=630;
String url="******************" ;
if(d.createTable()){
System.out.println("建表成功!!!");
try {
Document doc = dm.getDoc(url);
System.out.println(doc);
Element titles = doc.getElementById("maincontent");
Elements lis=titles.getElementsByTag("li");
//*********************标题****************************
for(int i=1;i
if(a.toString().equals("")){
title=lis.get(i).text();
contentTitle=title;
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
}else{
System.out.println("第"+(i+1)+"题节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
count++;
continue;
}else{
title=a.get(0).text();
url="http:***************"+a.get(0).attr("href");
Document doc2=dm.getDoc(url);
Element sections =doc2.getElementById("maincontent");
Elements ls = sections.getElementsByTag("li");
//**********************节************************
for(int j=507;j
if(link.toString().equals("")){
section=ls.get(j).text();
contentTitle=title+" "+section;
}else{
section = link.get(0).text();
url="http:****************"+link.get(0).attr("href");
Document doc3=dm.getDoc(url);
Element contents=doc3.getElementById("maincontent");
content=contents.text();
//处理内容字符串
content=content.substring(content.indexOf("?")+"?".length());
content=content.replace("'", "''");
contentTitle=title+" "+section;
}
System.out.println("****************"+count+"******************");
System.out.println("正在读第"+(i+1)+"题"+(j+1)+"节");
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
count++;
}else{
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
}//end for
}
System.out.println("第"+(i+1)+"题采集完毕");
break;
}//end for
System.out.println("采集完毕!!");
} catch (Exception e) {
e.printStackTrace();
}
经过各位大神指点修改后,这个程序的效率有明显提高。不过现在运行起来随时会抛出下面两个异常,还请各位大虾指点怎么解决:
1.java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:129)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:687)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:632)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1064)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:373)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:429)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153)
at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35)
at com.wanfang.dousact.DoMain5.main(DoMain5.java:61)
2.java.net.SocketTimeoutException: connect timed out
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.PlainSocketImpl.doConnect(PlainSocketImpl.java:333)
at java.net.PlainSocketImpl.connectToAddress(PlainSocketImpl.java:195)
at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:182)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:366)
at java.net.Socket.connect(Socket.java:519)
at sun.net.NetworkClient.doConnect(NetworkClient.java:158)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:394)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:529)
at sun.net.www.http.HttpClient.<init>(HttpClient.java)
at sun.net.www.http.HttpClient.New(HttpClient.java:306)
at sun.net.www.http.HttpClient.New(HttpClient.java:323)
at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient(HttpURLConnection.java:852)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:793)
at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:718)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:425)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153)
at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35)
at com.wanfang.dousact.DoMain5.main(DoMain5.java:87)
你可以参考这个看看
https://www.baidu.com/s?wd=jsoup%20%E5%A4%AA%E6%85%A2&rsv_spt=1&rsv_iqid=0xa4c58e5b0001928e&issp=1&f=3&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=java%2520write%2520%255Ct%2520%25E6%25B2%25A1%25E7%2594%25A8&inputT=11038&rsv_t=64ecKnkhyG%2Bspt6MnEr2Ttfue0gE4iYduVY65jj1n6jePnM1gL%2FwO3GVvk4XcSPt8z5R&rsv_pq=e14f800e00019f3d&sug=jsoup%E8%A7%A3%E6%9E%90html&rsv_sug3=27&rsv_sug1=19&rsv_n=1&rsv_sug2=0&prefixsug=jsoup%2520%25E5%25A4%25AA%25E6%2585%25A2&rsp=0&rsv_sug4=12115
HTML怎么学习?HTML怎么入门?HTML在哪学?HTML怎么学才快?不用担心,这里为大家提供了HTML速学教程(入门课程),有需要的小伙伴保存下载就能学习啦!
Copyright 2014-2025 https://www.php.cn/ All Rights Reserved | php.cn | 湘ICP备2023035733号