00001
package com.quadcap.http.client;
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
import java.io.*;
00042
00043
import java.util.ArrayList;
00044
import java.util.Collections;
00045
import java.util.HashMap;
00046
import java.util.Iterator;
00047
00048
import org.xml.sax.AttributeList;
00049
import org.xml.sax.DocumentHandler;
00050
import org.xml.sax.DTDHandler;
00051
import org.xml.sax.EntityResolver;
00052
import org.xml.sax.ErrorHandler;
00053
import org.xml.sax.HandlerBase;
00054
import org.xml.sax.InputSource;
00055
import org.xml.sax.Locator;
00056
import org.xml.sax.SAXException;
00057
00058
import com.quadcap.text.sax.Parser;
00059
00060
import com.quadcap.http.util.HeaderParser;
00061
00062
import com.quadcap.util.collections.ArrayQueue;
00063
import com.quadcap.util.collections.DiGraph;
00064
00065
import com.quadcap.util.text.OctetMap;
00066
import com.quadcap.util.text.Scanner;
00067
00068
import com.quadcap.util.Debug;
00069
import com.quadcap.util.Util;
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082 public class LinkChecker implements DocumentHandler {
00083
00084 String
base;
00085
00086
00087 String
urlBase;
00088
00089
00090 String
currentUrl;
00091
00092
00093 DiGraph
links =
new DiGraph();
00094
00095
00096 ArrayQueue
linksToCheck =
new ArrayQueue();
00097
00098
00099 HashMap
allLinks =
new HashMap();
00100
00101
00102 HashMap
linksChecked =
new HashMap();
00103
00104 Parser
parser;
00105 String
host;
00106
00107 public LinkChecker(String url) {
00108
parser =
new Parser();
00109 String s = url;
00110
if (s.startsWith(
"http://")) {
00111 s = url.substring(
"http://".length());
00112 }
00113
int idx = s.indexOf(
'/');
00114
if (idx > 0) s = s.substring(0, idx);
00115
host =
"http://" + s;
00116
push(url, 0);
00117 }
00118
00119 synchronized void push(String url,
int line) {
00120
if (
allLinks.get(url) == null && url.startsWith(
host)) {
00121 System.out.println(
"PUSH " +
trim(
base) +
" -> " +
trim(url));
00122
if (
currentUrl != null) {
00123
links.addArc(
currentUrl +
":" + line, url);
00124 }
00125
allLinks.put(url,
"queued");
00126
linksToCheck.push(url);
00127 }
00128 }
00129
00130 String
trim(String url) {
00131
if (url != null && url.startsWith(
host)) {
00132 url = url.substring(
host.length());
00133 }
00134
return url;
00135 }
00136
00137 public void printBadLinks() {
00138 ArrayList k =
new ArrayList();
00139 Iterator iter =
linksChecked.keySet().iterator();
00140
while (iter.hasNext()) {
00141 String url = iter.next().toString();
00142 String val =
linksChecked.get(url).toString();
00143
if (!val.equals(
"found")) {
00144 Iterator x =
links.getParents(url);
00145 String ref = x.hasNext() ? x.next().toString() :
"";
00146 k.add(trim(ref) +
"\n error: " + trim(url));
00147 }
00148 }
00149 Collections.sort(k);
00150 iter = k.iterator();
00151
while (iter.hasNext()) {
00152 System.out.println(iter.next().toString());
00153 }
00154 System.out.println(
"--------------------\n");
00155 System.out.println(
"" + k.size() +
" errors");
00156 }
00157
/**
 * Processes queued URLs until linksToCheck is empty.
 *
 * For each URL not yet present in linksChecked: fetches it, reads the
 * response line and headers, and records a status in both allLinks and
 * linksChecked ("missing" when the response does not start with "200",
 * "found" after a successful parse, "error" on IOException, "exception"
 * on anything else). While a text/html document is parsed, this object is
 * the parser's document handler.
 *
 * NOTE(review): the listing numbers jump from 00195 to 00198 below and the
 * variable "in" passed to parser.parse is not declared anywhere in the
 * visible lines; its declaration is presumably on the missing lines -
 * confirm against the original file.
 *
 * @throws Exception declared by the signature; in the visible code an
 *         IOException from is.close() in the finally block is one way an
 *         exception can escape the per-URL catch blocks
 */
00158 public void run() throws Exception {
00159
// NOTE(review): cnt is never read in the visible code.
00160
int cnt = 0;
00161
while (
linksToCheck.size() > 0) {
// Progress prefix: "<checked + 1> of <pending + checked>: ".
00162 System.out.print(
"" + (
linksChecked.size()+1) +
" of " +
00163 (
linksToCheck.size() +
linksChecked.size()) +
00164
": ");
00165 String url =
linksToCheck.popBack().toString();
// Skip URLs that already have a recorded result.
00166
if (
linksChecked.get(url) != null)
continue;
00167 System.out.println(trim(url));
// push() uses currentUrl as the referrer for links found in this document.
00168
currentUrl = url;
00169 InputStream is = null;
00170
try {
00171 is =
HttpFetcher.fetchStream(url);
00172
Scanner scanner =
new Scanner(is);
00173 HashMap headers =
new HashMap();
// Presumably skips the first token of the response line and the whitespace
// after it, so that resp starts at the status code - confirm against Scanner.
00174 scanner.
skipUntil(
OctetMap.wsChars);
00175 scanner.
skipWhile(
OctetMap.wsChars);
00176 String resp = scanner.
parseUntil(
OctetMap.crlfChars);
00177
HeaderParser.parseCRLF(scanner);
00178
HeaderParser.parseHeaders(scanner, headers);
// Anything other than a response starting with "200" is recorded as "missing"
// and reported on stderr together with the first known referrer ("---" if none).
00179
if (!resp.startsWith(
"200")) {
00180
allLinks.put(url,
"missing");
00181
linksChecked.put(url,
"missing");
00182 Iterator iter =
links.getParents(url);
00183 String referrer =
00184 iter.hasNext()
00185 ? iter.next().toString()
00186 :
"---";
00187 System.err.println(
"*** " + trim(url) +
"," +
00188 trim(referrer) +
"," + resp);
00189
continue;
00190 }
// Only documents whose content-type header equals "text/html" exactly are parsed.
// NOTE(review): a value such as "text/html; charset=..." does not match, and a
// skipped URL gets no entry in linksChecked (its allLinks status stays "queued").
00191 String mimeType = (String)headers.get(
"content-type");
00192
if (mimeType == null || !mimeType.equals(
"text/html")) {
00193
continue;
00194 }
00195 InputStreamReader r =
new InputStreamReader(is);
// NOTE(review): listing lines 00196-00197 are not present here; "in" used below
// is presumably declared there - confirm.
00198
parser.setDocumentHandler(
this);
// Relative hrefs found during the parse are resolved against this URL.
00199
setBase(url);
00200
parser.parse(in);
00201
allLinks.put(url,
"found");
00202
linksChecked.put(url,
"found");
00203 }
catch (IOException e) {
00204
Debug.print(e);
00205
allLinks.put(url,
"error");
00206
linksChecked.put(url,
"error");
00207 }
catch (Exception e3) {
00208
Debug.print(e3);
00209
allLinks.put(url,
"exception");
00210
linksChecked.put(url,
"exception");
00211 }
catch (Throwable t) {
00212
Debug.print(t);
00213
allLinks.put(url,
"exception");
00214
linksChecked.put(url,
"exception");
00215 } finally {
// Runs on every path out of the try, including the "continue" statements above.
// NOTE(review): an IOException thrown by close() is not caught here and would
// end run() with the remaining queue unprocessed.
00216
if (is != null) is.close();
00217
00218 }
00219 }
00220 }
00221
00222 public void setBase(String base) {
00223
this.base = base;
00224
this.urlBase =
parent(base);
00225
if (base.endsWith(
"/"))
urlBase = base;
00226 }
00227
00228 public void startDocument() {
00229 }
00230
00231 public void endDocument() {
00232 }
00233
/**
 * Document-handler callback for ignorable whitespace; forwards the same
 * buffer, offset and count to characters(ch, off, cnt).
 *
 * NOTE(review): listing line 00235 (between the parameter list and the
 * opening brace) is missing here, and the declaration of characters() is
 * not visible in this listing - confirm both against the original file.
 *
 * @param ch  character buffer
 * @param off offset of the first character in ch
 * @param cnt number of characters
 */
00234 public void ignorableWhitespace(
char[] ch,
int off,
int cnt)
00236 {
00237
characters(ch, off, cnt);
00238 }
00239
00240 public void processingInstruction(String target, String data) {
00241 }
00242
00244 }
00245
// NOTE(review): the header of this method (listing lines 00246-00247) is not
// present in this listing. The body reads "tag" and "attrs", so it is
// presumably the element-start callback of the document handler - confirm.
//
// Extracts the link target of <a href>, <img src> and <frame src> elements
// (tag names compared case-insensitively) and hands it to checkHref together
// with the parser's current line number.
00248 {
00249
try {
00250
if (tag.equalsIgnoreCase(
"a")) {
00251 String href = attrs.getValue(
"href");
00252
if (href != null)
checkHref(href,
parser.getLineNumber());
00253 }
else if (tag.equalsIgnoreCase(
"img") ||
00254 tag.equalsIgnoreCase(
"frame")) {
00255 String href = attrs.getValue(
"src");
00256
if (href != null)
checkHref(href,
parser.getLineNumber());
00257 }
00258 }
catch (Throwable t) {
// Any failure while handling one element is dumped to stderr with the
// element's context and is not rethrown.
00259 t.printStackTrace(System.err);
00260
00261 System.err.println(
"tag = " + tag);
00262 System.err.println(
"attrs = " + attrs);
00263 System.err.println(
"urlBase = " +
urlBase);
00264 }
00265 }
00266
00268 }
00269
00271 }
00272
00273 public void checkHref(String href,
int line) {
00274 String tbase =
urlBase;
00275 href = href.trim();
00276
if (href.length() > 0 && href.charAt(0) ==
'/') {
00277 href = href.substring(1);
00278 tbase =
"";
00279 }
else if (href.startsWith(
"http://")) {
00280 tbase =
"";
00281 }
else if (href.startsWith(
"ftp://") ||
00282 href.startsWith(
"mailto:")) {
00283
return;
00284 }
else {
00285
while (href.startsWith(
"./") || href.startsWith(
"../")) {
00286
if (href.startsWith(
"./")) {
00287 href = href.substring(2);
00288 }
else if (href.startsWith(
"../")) {
00289 href = href.substring(3);
00290 tbase =
parent(tbase);
00291 }
00292 }
00293 }
00294 String url = tbase + href;
00295
int idx = url.indexOf(
'#');
00296
if (idx >= 0) {
00297 url = url.substring(0, idx);
00298 }
00299
if (url.length() == 0)
return;
00300 push(url, line);
00301 }
00302
00303 static String
parent(String s) {
00304
for (
int i = s.length() - 2; i >= 0; i--) {
00305
if (s.charAt(i) ==
'/')
return s.substring(0, i+1);
00306 }
00307
throw new RuntimeException(
"Bad parent: " + s);
00308 }
00309 }
00310