使用Java调用百度搜索

yangshangchuan

浏览: 2450106 次
性别:
来自: 北京

最近访客更多访客>>

akingde

feilafei123

wf_chn

hero.niu_126.com

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java

java 搜索引擎百度搜索API

search托管于github

如何利用Java来调用百度搜索，更多细节请到github上查看search

自己没搜索引擎，又想要大规模的数据源，怎么办？可以对百度搜索善加利用，以小搏大，站在巨人的肩膀上。有很多的应用场景可以很巧妙地借助百度搜索来实现，比如网站的新闻采集，比如技术、品牌的新闻跟踪，比如知识库的收集，比如人机问答系统等，我之前做的一个准确率达百分之九十几的人机问答系统的数据源，其中一部分就是充分利用了百度搜索。我们可以很容易地扩展到其他的搜索引擎，使用JSoup+CSSPath技术，轻松获取页面的自定义的内容。

package org.apdplat.search;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Searcher that fetches a Baidu result page with JSoup and extracts the
 * result entries (title, URL, summary) through CSS selectors.
 *
 * <p>The selectors are tied to the markup of the Baidu result page; if that
 * markup changes, extraction yields fewer or no entries rather than failing.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher{
    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Number of results on one Baidu result page. */
    private static final int PAGE_SIZE = 10;

    /** Matches every character that is not an ASCII digit; compiled once and reused. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /**
     * Searches for the keyword and returns the first result page.
     *
     * @param keyword search keyword
     * @return the result of {@link #search(String, int)} for page 1
     */
    @Override
    public SearchResult search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Searches for the keyword and returns the requested result page.
     *
     * @param keyword search keyword; it is URL-encoded before being sent
     *                (a {@code null} keyword is treated as an empty string)
     * @param page    1-based page number
     * @return the search result; {@code null} when the reported result count is
     *         below 1 (kept for backward compatibility); a result with an empty
     *         page list when fetching the page fails with an I/O error
     */
    @Override
    public SearchResult search(String keyword, int page) {
        SearchResult searchResult = new SearchResult();
        searchResult.setPage(page);
        List<Webpage> webpages = new ArrayList<>();
        try {
            String url = buildSearchUrl(keyword, page);
            Document document = Jsoup.connect(url).get();

            // Total number of hits reported by the result page.
            int total = getBaiduSearchResultCount(document);
            searchResult.setTotal(total);
            if (total < 1) {
                return null;
            }
            // Fewer hits than one page holds: only look at that many entries.
            int len = Math.min(total, PAGE_SIZE);
            // Result entries carry a running number as element id, continuing across pages.
            int firstId = (page - 1) * PAGE_SIZE + 1;
            for (int i = 0; i < len; i++) {
                Webpage webpage = extractWebpage(document, firstId + i);
                if (webpage != null) {
                    webpages.add(webpage);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜索出错",ex);
        }
        searchResult.setWebpages(webpages);
        return searchResult;
    }

    /**
     * Builds the request URL for one result page.
     *
     * <p>The {@code pn} parameter is not a page number but the offset of the
     * first result: 0 for page 1, 10 for page 2, 20 for page 3, and so on,
     * i.e. {@code (page - 1) * PAGE_SIZE}.
     *
     * @param keyword search keyword, may be {@code null}
     * @param page    1-based page number
     * @return the request URL with the keyword encoded as UTF-8
     * @throws IOException if the UTF-8 encoding is reported as unsupported
     */
    private static String buildSearchUrl(String keyword, int page) throws IOException {
        String encoded = URLEncoder.encode(keyword == null ? "" : keyword, "UTF-8");
        return "http://www.baidu.com/s?pn=" + (page - 1) * PAGE_SIZE + "&wd=" + encoded;
    }

    /**
     * Extracts a single result entry from the result page.
     *
     * @param document parsed result page
     * @param resultId running number of the entry, used as its element id
     * @return the extracted entry, or {@code null} when title or summary is blank
     */
    private Webpage extractWebpage(Document document, int resultId) {
        String titleCssQuery = "html body div div div div#content_left div#" + resultId + ".result.c-container h3.t a";
        String summaryCssQuery = "html body div div div div#content_left div#" + resultId + ".result.c-container div.c-abstract";
        LOG.debug("titleCssQuery:{}", titleCssQuery);
        LOG.debug("summaryCssQuery:{}", summaryCssQuery);
        Element titleElement = document.select(titleCssQuery).first();
        if (titleElement == null) {
            // Fallback for a Baidu Baike entry, which uses different markup.
            titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
            summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
            LOG.debug("处理百度百科 titleCssQuery:{}", titleCssQuery);
            LOG.debug("处理百度百科 summaryCssQuery:{}", summaryCssQuery);
            titleElement = document.select(titleCssQuery).first();
        }
        String href = "";
        String titleText = "";
        if (titleElement != null) {
            titleText = titleElement.text();
            href = titleElement.attr("href");
        }
        LOG.debug(titleText);

        Element summaryElement = document.select(summaryCssQuery).first();
        if (summaryElement == null) {
            // Fallback for a Baidu Zhidao entry: the summary sits in a font element.
            summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font");
            LOG.debug("处理百度知道 summaryCssQuery:{}", summaryCssQuery);
            summaryElement = document.select(summaryCssQuery).first();
        }
        String summaryText = summaryElement == null ? "" : summaryElement.text();
        LOG.debug(summaryText);

        if (titleText.trim().isEmpty() || summaryText.trim().isEmpty()) {
            LOG.error("获取搜索结果列表项出错:{} - {}", titleText, summaryText);
            return null;
        }
        Webpage webpage = new Webpage();
        webpage.setTitle(titleText);
        webpage.setUrl(href);
        webpage.setSummary(summaryText);
        // href is never null here; an empty value means the link had no target to fetch.
        if (href != null && !href.isEmpty()) {
            webpage.setContent(Tools.getHTMLContent(href));
        } else {
            LOG.info("页面正确提取失败");
        }
        return webpage;
    }

    /**
     * Reads the number of search results from the result page.
     *
     * <p>Takes a text such as "百度为您找到相关结果约13,200个", removes every
     * non-digit character and parses the remainder.
     *
     * @param document parsed result page
     * @return the result count; 0 when the count element is missing or holds no
     *         parsable number; {@link Integer#MAX_VALUE} when the count exceeds
     *         the int range
     */
    private int getBaiduSearchResultCount(Document document){
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: {}", cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            // Without this guard a page lacking the element caused a NullPointerException.
            LOG.warn("未找到搜索结果数元素：{}", cssQuery);
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜索结果文本：" + totalText);

        String digits = NON_DIGIT.matcher(totalText).replaceAll("");
        if (digits.isEmpty()) {
            LOG.warn("搜索结果文本中没有数字：{}", totalText);
            return 0;
        }
        int total;
        try {
            // Parse as long and clamp, so counts beyond the int range do not throw.
            total = (int) Math.min(Long.parseLong(digits), (long) Integer.MAX_VALUE);
        } catch (NumberFormatException ex) {
            LOG.warn("无法解析搜索结果数：{}", digits);
            return 0;
        }
        LOG.info("搜索结果数：" + total);
        return total;
    }

    /**
     * Demo entry point: runs one search and logs every extracted entry.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Searcher searcher = new JSoupBaiduSearcher();
        SearchResult searchResult = searcher.search("杨尚川",1);
        // search returns null when nothing was found, so guard before dereferencing.
        List<Webpage> webpages = searchResult == null ? null : searchResult.getWebpages();
        if (webpages != null) {
            int i = 1;
            LOG.info("搜索结果 当前第 " + searchResult.getPage() + " 页，页面大小为：" + searchResult.getPageSize() + " 共有结果数：" + searchResult.getTotal());
            for (Webpage webpage : webpages) {
                LOG.info("搜索结果 " + (i++) + " ：");
                LOG.info("标题：" + webpage.getTitle());
                LOG.info("URL：" + webpage.getUrl());
                LOG.info("摘要：" + webpage.getSummary());
                LOG.info("正文：" + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("没有搜索到结果");
        }
    }
}

7
顶

1
踩

分享到：

使用Java调用谷歌搜索 | 如何解决BUG？

2013-10-19 01:46
浏览 19712
评论(10)
分类:互联网
查看更多

10 楼 yangshangchuan 2015-04-29

zbsxlsf 写道

好像最多只能取一页8条记录啊

有一些特殊的，比如什么百度文库、百度百科、人人网人名搜索结果这类的，这些都需要特殊处理

9 楼 zbsxlsf 2015-04-28

好像最多只能取一页8条记录啊

8 楼 cf2huihui 2014-08-25

yangshangchuan 写道

cf2huihui 写道

你好，我用你的titleCssQuery和summaryCssQuery内容解析不出来数据，会报空指针异常，我用自己写的

 String unqiueTitleCssQuery = "div.c-container > .t";

可以解析到数据，可是只能解析标题，其他内容无法在循环块内一起取出来，导致webpage这个类只能有title值，其他都没有。求指教方法，谢谢

百度的页面结构变了，你看我这个项目的提取方法吧：https://github.com/ysc/rank/blob/master/src/main/java/org/seo/rank/impl/BaiduRanker.java

好的，我把项目下载下来研究下，谢谢分享

7 楼 cf2huihui 2014-08-25

yangshangchuan 写道

cf2huihui 写道

经过我的努力，终于获取到数据了，只需要将每个列表获取出来，然后分析每个列表内的html代码，获取标题和链接就可以了。很高兴，同时，谢谢你的源码

不客气，我还有个SEO优化的工具，用于查询搜索引擎的收录排名，其中就有获取百度搜索结果的代码，你看看，是最近几天搞的：https://github.com/ysc/rank

好的，谢谢，我这就去学习下

6 楼 yangshangchuan 2014-08-22

cf2huihui 写道

你好，我用你的titleCssQuery和summaryCssQuery内容解析不出来数据，会报空指针异常，我用自己写的

 String unqiueTitleCssQuery = "div.c-container > .t";

可以解析到数据，可是只能解析标题，其他内容无法在循环块内一起取出来，导致webpage这个类只能有title值，其他都没有。求指教方法，谢谢

百度的页面结构变了，你看我这个项目的提取方法吧：https://github.com/ysc/rank/blob/master/src/main/java/org/seo/rank/impl/BaiduRanker.java

5 楼 yangshangchuan 2014-08-22

cf2huihui 写道

不客气，我还有个SEO优化的工具，用于查询搜索引擎的收录排名，其中就有获取百度搜索结果的代码，你看看，是最近几天搞的：https://github.com/ysc/rank

4 楼 cf2huihui 2014-08-20

3 楼 cf2huihui 2014-08-18

你好，我用你的titleCssQuery和summaryCssQuery内容解析不出来数据，会报空指针异常，我用自己写的

 String unqiueTitleCssQuery = "div.c-container > .t";

可以解析到数据，可是只能解析标题，其他内容无法在循环块内一起取出来，导致webpage这个类只能有title值，其他都没有。求指教方法，谢谢

2 楼 yangshangchuan 2014-05-31

东海龙宫写道

请问下你用的哪些jar包啊，程序下载后jar包没有，一直运行不了呢，感谢 1124008750@qq.com

看完整的项目：https://github.com/ysc/search-demo

1 楼东海龙宫 2014-05-27

请问下你用的哪些jar包啊，程序下载后jar包没有，一直运行不了呢，感谢 1124008750@qq.com

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论