详细Java批量获取微信公众号方法(5)

发布时间：2021-02-21 17:01 所属栏目：12 来源：网络整理

导读：本篇给出处理跳转、向微信页面注入js的方法 getWxHis()：当前页面为公众号历史消息时执行该方法；采集队列表中有一个load字段，当值等于1时代表该行正在被读取……

处理跳转、向微信页面注入js的方法：

/**
 * Builds the JavaScript snippet that redirects the WeChat page to the next URL to collect.
 *
 * <p>Rows of the collection queue whose load field equals 1 (rows that were being read) are
 * deleted first, then one random row is selected from the queue. When the queue is empty, an
 * official account is taken from the account table, its history-page URL is built from its biz
 * value and its collect timestamp is set to the current time. Otherwise the content URL of the
 * queued row is used and that row is updated through {@code updateByContentUrl}.
 *
 * @return a script tag that redirects after a random delay of 3 to 5 seconds, or an empty string
 *         when the queue is empty and no official account is available
 */
public String getWxHis() {
    String url = "";
    // Drop the rows that were flagged as "being read" (load = 1), then pick any remaining row.
    tmpListMapper.deleteByLoad(1);
    TmpList queue = tmpListMapper.selectRandomOne();
    System.out.println("queue is null?"+queue);
    if(queue == null){
      // The queue is empty: fall back to the official-account table.
      // Per the original author's note, selectOne orders accounts by their collect time
      // ascending, so the least recently collected account is returned.
      WeiXin weiXin = weiXinMapper.selectOne();
      if(weiXin == null){
        // Nothing left to visit; injecting a redirect to an empty URL would be meaningless.
        System.out.println("getHis: no official account available");
        return "";
      }

      String biz = weiXin.getBiz();
      // History-message page URL of the account (second page layout).
      url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + biz +
          "#wechat_redirect";
      // Record the current time as the account's collect timestamp.
      weiXin.setCollect(System.currentTimeMillis());
      int result = weiXinMapper.updateByPrimaryKey(weiXin);
      System.out.println("getHis weiXin updateResult:"+result);
    }else{
      // Use the content_url of the selected queue row.
      url = queue.getContentUrl();
      // Flag that row as being read (load = 1).
      tmpListMapper.updateByContentUrl(url);
    }
    // Turn the next URL into a script that anyproxy injects into the WeChat page.
    // The delay is randomised between 3 and 5 seconds.
    int randomTime = new Random().nextInt(3) + 3;
    String jsCode = "<script>setTimeout(function(){window.location.href='"+url+"';},"+randomTime*1000+");</script>";
    return jsCode;
}

以上就是对代理服务器拦截到的数据进行处理的程序。这里有一个需要注意的问题：程序会对数据库中收录的每个公众号进行轮询访问，甚至已经存储的文章也会再次访问，目的是持续更新文章的阅读数和点赞数。如果需要抓取大量的公众号，建议对添加任务队列的代码进行修改，添加条件限制，否则公众号一多，轮询抓取重复数据将十分影响效率。

至此，微信公众号的文章链接就全部爬取到了，而且这些链接永久有效，可以直接在浏览器中打开。接下来就是写爬虫程序，从数据库中取出链接，爬取文章内容等信息了。

我是用webmagic写的爬虫，轻量好用。

public class SpiderModel implements PageProcessor{
  
  private static PostMapper postMapper;
  
  private static List<Post> posts;
  
  // 抓取网站的相关配置，包括编码、抓取间隔、重试次数等
  private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
  
  public Site getSite() {
    // TODO Auto-generated method stub
    return this.site;
  }
  
  public void process(Page page) {
    // TODO Auto-generated method stub
    Post post = posts.remove(0);
    String content = page.getHtml().xpath("//div[@id='js_content']").get();
    //存在和谐文章 此处做判定如果有直接删除记录或设置表示位表示文章被和谐
    if(content == null){
      System.out.println("文章已和谐！");
      //postMapper.deleteByPrimaryKey(post.getId());
      return;
    }
    String contentSnap = content.replaceAll("data-src","src").replaceAll("preview.html","player.html");//快照
    String contentTxt = HtmlToWord.stripHtml(content);//纯文本内容
    
    Selectable metaContent = page.getHtml().xpath("//div[@id='meta_content']");
    String pubTime = null;
    String wxname = null;
    String author = null;
    if(metaContent != null){
      pubTime = metaContent.xpath("//em[@id='post-date']").get();
      if(pubTime != null){
        pubTime = HtmlToWord.stripHtml(pubTime);//文章发布时间
      }
      wxname = metaContent.xpath("//a[@id='post-user']").get();
      if(wxname != null){
        wxname = HtmlToWord.stripHtml(wxname);//公众号名称
      }
      author = metaContent.xpath("//em[@class='rich_media_meta rich_media_meta_text' and @id!='post-date']").get();
      if(author != null){
        author = HtmlToWord.stripHtml(author);//文章作者
      }
    }
    
//    System.out.println("发布时间:"+pubTime);
//    System.out.println("公众号名称:"+wxname);
//    System.out.println("文章作者:"+author);
    
    String title = post.getTitle().replaceAll(" ","");//文章标题
    String digest = post.getDigest();//文章摘要
    int likeNum = post.getLikenum();//文章点赞数
    int readNum = post.getReadnum();//文章阅读数
    String contentUrl = post.getContentUrl();//文章链接
    
    WechatInfoBean wechatBean = new WechatInfoBean();
    wechatBean.setTitle(title);
    wechatBean.setContent(contentTxt);//纯文本内容
    wechatBean.setSourceCode(contentSnap);//快照
    wechatBean.setLikeCount(likeNum);
    wechatBean.setViewCount(readNum);
    wechatBean.setAbstractText(digest);//摘要
    wechatBean.setUrl(contentUrl);
    wechatBean.setPublishTime(pubTime);
    wechatBean.setSiteName(wxname);//站点名称 公众号名称
    wechatBean.setAuthor(author);
    wechatBean.setMediaType("微信公众号");//来源媒体类型
    
    WechatStorage.saveWechatInfo(wechatBean);
    
    //标示文章已经被爬取
    post.setIsSpider(1);
    postMapper.updateByPrimaryKey(post);
    
  }  
  
  public static void startSpider(List<Post> inposts,PostMapper myPostMapper,String... urls){
    
    long startTime,endTime;
    startTime = System.currentTimeMillis();
    postMapper = myPostMapper;
    posts = inposts;
    
    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();    
    SpiderModel spiderModel = new SpiderModel();
    Spider mySpider = Spider.create(spiderModel).addUrl(urls);
    mySpider.setDownloader(httpClientDownloader);
    try {
      SpiderMonitor.instance().register(mySpider);
      mySpider.thread(1).run();
    } catch (JMException e) {
      e.printStackTrace();
    }
    
    endTime = System.currentTimeMillis();
    System.out.println("爬取时间" + ((endTime - startTime) / 1000) + "秒--");
    
  }
  
}

其它与核心逻辑无关的数据存储代码就不贴了。这里我把代理服务器抓取到的数据存在了mysql，把自己的爬虫程序爬到的数据存在了mongodb。

下面是自己爬取到的公众号的信息：

详细Java批量获取微信公众号方法

（编辑：ASP站长网）