public static void duplicateImage(String sourceURL, String imgSrc, String filename) throws Exception { URL source = new URL(imgSrc); HttpURLConnection urlconn = (HttpURLConnection) source.openConnection(); HttpURLConnection.setFollowRedirects(false); if (imgSrc.indexOf("xuite") >= 0) { String cookieVal = urlconn.getHeaderField("Set-Cookie"); // get server response. String sessionId = ""; if (cookieVal != null) { sessionId = cookieVal.substring(0, cookieVal.indexOf(";")); } urlconn.disconnect(); urlconn = (HttpURLConnection) source.openConnection(); // restart a new connection. if (sessionId != null) { urlconn.setRequestProperty("Cookie", sessionId); } urlconn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.9 Safari/525.19"); urlconn.setRequestProperty("Accept-Language", "zh-TW,zh,en-US,en"); urlconn.setRequestProperty("Accept-Charset", "Big5,*,utf-8"); urlconn.setRequestProperty("Accept-Encoding", "gzip,deflate,bzip2"); urlconn.setRequestProperty("Host", "7.blog.xuite.net"); urlconn.setRequestProperty("Connection", "Keep-Alive"); } else if (imgSrc.indexOf("wretch") >= 0) { urlconn.setRequestProperty("Referer", sourceURL); } BufferedInputStream input = new BufferedInputStream(urlconn.getInputStream()); FileOutputStream output = new FileOutputStream("./img/" + filename); System.out.print("download file \"" + filename + "\" --> "); byte[] buff = new byte[1024]; // buffer byte array. int c = 0; double total = urlconn.getContentLength(); // file size. double now = 0; while ((c = input.read(buff)) != -1) { output.write(buff, 0, c); now += c; System.out.print((int) ((now / total) * 100) + "% > "); // print progress. } output.flush(); input.close(); // close stream. output.close(); System.out.print("complete.\n"); }
xuite: 解決方法是先取得 server 的 session cookie, 在下一次請求時一併送回, 並設定與瀏覽器相同的表頭來通過檢查。如果你不知道表頭該怎麼設定, 可以使用 SmartSniff 等封包監聽工具, 先手動下載一次圖片、觀察瀏覽器送出的表頭, 再依樣畫葫蘆填回去就好。如果不填的話, 像 User-Agent 會變成 Java 的預設值 (內含你的 JDK 版本), 對方 server 會攔截掉這種請求。
無名 (wretch): server 會檢查請求的來源 (Referer), 所以如果你直接開啟圖片網址會被拒絕。這時我們在表頭加上 Referer, 並設為原始網頁的網址, 這樣解析出來的圖片就可以下載了喔!
解析網頁中的圖片有很多種方式, 當然你要先把完整的 HTML 文字下載回來; 我的方式是使用 Regular Expression:
String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; Pattern pattern = Pattern.compile(searchImgReg); // use regular expression to parse image src. Matcher matcher = pattern.matcher(html text); while (matcher.find()) // loop and get target string. { try { String imgurl = matcher.group(3); String filename = matcher.group(7).substring(1, matcher.group(7).length()); System.out.println("source: " + imgurl); duplicateImage(imgurl, filename); } catch (Exception e) { e.printStackTrace(); } }
設定host的建議不錯!
回覆刪除