Quadcap Embeddable Server

com/quadcap/http/client/LinkChecker.java

Go to the documentation of this file.
00001 package com.quadcap.http.client; 00002 00003 /* Copyright 1998 - 2003 Quadcap Software. All rights reserved. 00004 * 00005 * This software is distributed under the Quadcap Free Software License. 00006 * This software may be used or modified for any purpose, personal or 00007 * commercial. Open Source redistributions are permitted. Commercial 00008 * redistribution of larger works derived from, or works which bundle 00009 * this software requires a "Commercial Redistribution License"; see 00010 * http://www.quadcap.com/purchase. 00011 * 00012 * Redistributions qualify as "Open Source" under one of the following terms: 00013 * 00014 * Redistributions are made at no charge beyond the reasonable cost of 00015 * materials and delivery. 00016 * 00017 * Redistributions are accompanied by a copy of the Source Code or by an 00018 * irrevocable offer to provide a copy of the Source Code for up to three 00019 * years at the cost of materials and delivery. Such redistributions 00020 * must allow further use, modification, and redistribution of the Source 00021 * Code under substantially the same terms as this license. 00022 * 00023 * Redistributions of source code must retain the copyright notices as they 00024 * appear in each source code file, these license terms, and the 00025 * disclaimer/limitation of liability set forth as paragraph 6 below. 00026 * 00027 * Redistributions in binary form must reproduce this Copyright Notice, 00028 * these license terms, and the disclaimer/limitation of liability set 00029 * forth as paragraph 6 below, in the documentation and/or other materials 00030 * provided with the distribution. 00031 * 00032 * The Software is provided on an "AS IS" basis. No warranty is 00033 * provided that the Software is free of defects, or fit for a 00034 * particular purpose. 00035 * 00036 * Limitation of Liability. Quadcap Software shall not be liable 00037 * for any damages suffered by the Licensee or any third party resulting 00038 * from use of the Software. 00039 */ 00040 00041 import java.io.*; 00042 00043 import java.util.ArrayList; 00044 import java.util.Collections; 00045 import java.util.HashMap; 00046 import java.util.Iterator; 00047 00048 import org.xml.sax.AttributeList; 00049 import org.xml.sax.DocumentHandler; 00050 import org.xml.sax.DTDHandler; 00051 import org.xml.sax.EntityResolver; 00052 import org.xml.sax.ErrorHandler; 00053 import org.xml.sax.HandlerBase; 00054 import org.xml.sax.InputSource; 00055 import org.xml.sax.Locator; 00056 import org.xml.sax.SAXException; 00057 00058 import com.quadcap.text.sax.Parser; 00059 00060 import com.quadcap.http.util.HeaderParser; 00061 00062 import com.quadcap.util.collections.ArrayQueue; 00063 import com.quadcap.util.collections.DiGraph; 00064 00065 import com.quadcap.util.text.OctetMap; 00066 import com.quadcap.util.text.Scanner; 00067 00068 import com.quadcap.util.Debug; 00069 import com.quadcap.util.Util; 00070 00071 00072 /** 00073 * This class implements a simple link checker, following links 00074 * in the following tags: 00075 * 00076 * <ul> 00077 * <li><b>&lt;A HREF=""&gt; 00078 * <li><b>&lt;IMG SRC=""&gt; 00079 * <li><b>&lt;FRAME SRC=""&gt; 00080 * </ul> 00081 */ 00082 public class LinkChecker implements DocumentHandler { 00083 /** uri of the document we're currently fetching and parsing */ 00084 String base; 00085 00086 /** base uri of the current document for relative href resolution */ 00087 String urlBase; 00088 00089 /** uri of the document we're currently fetching and parsing */ 00090 String currentUrl; 00091 00092 /** directed graph of all links found so far (even bad ones...) */ 00093 DiGraph links = new DiGraph(); 00094 00095 /** queue of links to check */ 00096 ArrayQueue linksToCheck = new ArrayQueue(); 00097 00098 /** uri -> status for all links */ 00099 HashMap allLinks = new HashMap(); 00100 00101 /** uri -> status for completed links */ 00102 HashMap linksChecked = new HashMap(); 00103 00104 Parser parser; 00105 String host; 00106 00107 public LinkChecker(String url) { 00108 parser = new Parser(); 00109 String s = url; 00110 if (s.startsWith("http://")) { 00111 s = url.substring("http://".length()); 00112 } 00113 int idx = s.indexOf('/'); 00114 if (idx > 0) s = s.substring(0, idx); 00115 host = "http://" + s; 00116 push(url, 0); 00117 } 00118 00119 synchronized void push(String url, int line) { 00120 if (allLinks.get(url) == null && url.startsWith(host)) { 00121 System.out.println("PUSH " + trim(base) + " -> " + trim(url)); 00122 if (currentUrl != null) { 00123 links.addArc(currentUrl + ":" + line, url); 00124 } 00125 allLinks.put(url, "queued"); 00126 linksToCheck.push(url); 00127 } 00128 } 00129 00130 String trim(String url) { 00131 if (url != null && url.startsWith(host)) { 00132 url = url.substring(host.length()); 00133 } 00134 return url; 00135 } 00136 00137 public void printBadLinks() { 00138 ArrayList k = new ArrayList(); 00139 Iterator iter = linksChecked.keySet().iterator(); 00140 while (iter.hasNext()) { 00141 String url = iter.next().toString(); 00142 String val = linksChecked.get(url).toString(); 00143 if (!val.equals("found")) { 00144 Iterator x = links.getParents(url); 00145 String ref = x.hasNext() ? x.next().toString() : ""; 00146 k.add(trim(ref) + "\n error: " + trim(url)); 00147 } 00148 } 00149 Collections.sort(k); 00150 iter = k.iterator(); 00151 while (iter.hasNext()) { 00152 System.out.println(iter.next().toString()); 00153 } 00154 System.out.println("--------------------\n"); 00155 System.out.println("" + k.size() + " errors"); 00156 } 00157 00158 public void run() throws Exception { 00159 //HtmlParser parser = new HtmlParser(); 00160 int cnt = 0; 00161 while (linksToCheck.size() > 0) { 00162 System.out.print("" + (linksChecked.size()+1) + " of " + 00163 (linksToCheck.size() + linksChecked.size()) + 00164 ": "); 00165 String url = linksToCheck.popBack().toString(); 00166 if (linksChecked.get(url) != null) continue; 00167 System.out.println(trim(url)); 00168 currentUrl = url; 00169 InputStream is = null; 00170 try { 00171 is = HttpFetcher.fetchStream(url); 00172 Scanner scanner = new Scanner(is); 00173 HashMap headers = new HashMap(); 00174 scanner.skipUntil(OctetMap.wsChars); 00175 scanner.skipWhile(OctetMap.wsChars); 00176 String resp = scanner.parseUntil(OctetMap.crlfChars); 00177 HeaderParser.parseCRLF(scanner); 00178 HeaderParser.parseHeaders(scanner, headers); 00179 if (!resp.startsWith("200")) { 00180 allLinks.put(url, "missing"); 00181 linksChecked.put(url, "missing"); 00182 Iterator iter = links.getParents(url); 00183 String referrer = 00184 iter.hasNext() 00185 ? iter.next().toString() 00186 : "---"; 00187 System.err.println("*** " + trim(url) + "," + 00188 trim(referrer) + "," + resp); 00189 continue; 00190 } 00191 String mimeType = (String)headers.get("content-type"); 00192 if (mimeType == null || !mimeType.equals("text/html")) { 00193 continue; 00194 } 00195 InputStreamReader r = new InputStreamReader(is); 00198 parser.setDocumentHandler(this); 00199 setBase(url); 00200 parser.parse(in); 00201 allLinks.put(url, "found"); 00202 linksChecked.put(url, "found"); 00203 } catch (IOException e) { 00204 Debug.print(e); 00205 allLinks.put(url, "error"); 00206 linksChecked.put(url, "error"); 00207 } catch (Exception e3) { 00208 Debug.print(e3); 00209 allLinks.put(url, "exception"); 00210 linksChecked.put(url, "exception"); 00211 } catch (Throwable t) { 00212 Debug.print(t); 00213 allLinks.put(url, "exception"); 00214 linksChecked.put(url, "exception"); 00215 } finally { 00216 if (is != null) is.close(); 00217 //System.out.println("Result: " + allLinks.get(url)); 00218 } 00219 } 00220 } 00221 00222 public void setBase(String base) { 00223 this.base = base; 00224 this.urlBase = parent(base); 00225 if (base.endsWith("/")) urlBase = base; 00226 } 00227 00228 public void startDocument() { 00229 } 00230 00231 public void endDocument() { 00232 } 00233 00234 public void ignorableWhitespace(char[] ch, int off, int cnt) 00236 { 00237 characters(ch, off, cnt); 00238 } 00239 00240 public void processingInstruction(String target, String data) { 00241 } 00242 00244 } 00245 00248 { 00249 try { 00250 if (tag.equalsIgnoreCase("a")) { 00251 String href = attrs.getValue("href"); 00252 if (href != null) checkHref(href, parser.getLineNumber()); 00253 } else if (tag.equalsIgnoreCase("img") || 00254 tag.equalsIgnoreCase("frame")) { 00255 String href = attrs.getValue("src"); 00256 if (href != null) checkHref(href, parser.getLineNumber()); 00257 } 00258 } catch (Throwable t) { 00259 t.printStackTrace(System.err); 00260 //System.err.println(t.toString()); 00261 System.err.println("tag = " + tag); 00262 System.err.println("attrs = " + attrs); 00263 System.err.println("urlBase = " + urlBase); 00264 } 00265 } 00266 00268 } 00269 00271 } 00272 00273 public void checkHref(String href, int line) { 00274 String tbase = urlBase; 00275 href = href.trim(); 00276 if (href.length() > 0 && href.charAt(0) == '/') { 00277 href = href.substring(1); 00278 tbase = ""; 00279 } else if (href.startsWith("http://")) { 00280 tbase = ""; 00281 } else if (href.startsWith("ftp://") || 00282 href.startsWith("mailto:")) { 00283 return; 00284 } else { 00285 while (href.startsWith("./") || href.startsWith("../")) { 00286 if (href.startsWith("./")) { 00287 href = href.substring(2); 00288 } else if (href.startsWith("../")) { 00289 href = href.substring(3); 00290 tbase = parent(tbase); 00291 } 00292 } 00293 } 00294 String url = tbase + href; 00295 int idx = url.indexOf('#'); 00296 if (idx >= 0) { 00297 url = url.substring(0, idx); 00298 } 00299 if (url.length() == 0) return; 00300 push(url, line); 00301 } 00302 00303 static String parent(String s) { 00304 for (int i = s.length() - 2; i >= 0; i--) { 00305 if (s.charAt(i) == '/') return s.substring(0, i+1); 00306 } 00307 throw new RuntimeException("Bad parent: " + s); 00308 } 00309 } 00310