要爬取的一个网站用了极验的验证码,这周一直在想怎么破解它。在网上搜了很多资料,看到知乎上有人问过这个问题,于是我按照其中的思路大致实现了一下。
1. 使用htmlunit(这种方式我没有成功:模拟鼠标拖拽后没有生成轨迹,可以跳过)
我用的是java,首先想到的是直接用htmlunit,先做了一些初始化:
private void initWebClient() { if (webClient != null) { return; } webClient = new WebClient(BrowserVersion.FIREFOX_24); webClient.getOptions().setProxyConfig(new ProxyConfig("127.0.0.1",8888)); webClient.getOptions().setActiveXNative(true); webClient.getOptions().setUseInsecureSSL(true); // 配置证书 webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setCssEnabled(true); webClient.setCssErrorHandler(new SilentCssErrorHandler()); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); CookieManager cookieManager = new CookieManager(); ListhttpCookies = client.getCookies();//其方式获取的cookie for (org.apache.http.cookie.Cookie cookie : httpCookies) { cookieManager.addCookie(new com.gargoylesoftware.htmlunit.util.Cookie(cookie)); } webClient.setCookieManager(cookieManager); }