I'm going to play with Nutch tonight as a possible replacement for my own personal web crawler. Nutch is a Java-based web crawler that we may adopt for our large crawling process, but not for our ripping/indexing layer.
Followup:
Why I’m not going with Nutch:
<code>
fetching http://www.everydaybirthday.com/
fetching http://welcome.hp.com/gms/gr/el/sz3/smb/notebooks_tabletpcs.html
fetching http://boomp3.com/listen/fbnoc45_p/am-gold-1970-04-your-song-elton-john
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
java.lang.NullPointerException
at org.apache.hadoop.fs.FSDataInputStream$Buffer.getPos(FSDataInputStream.java:87)
at org.apache.hadoop.fs.FSDataInputStream.getPos(FSDataInputStream.java:125)
at org.apache.hadoop.io.SequenceFile$Reader.getPosition(SequenceFile.java:1736)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getProgress(SequenceFileRecordReader.java:108)
at org.apache.hadoop.mapred.MapTask$1.getProgress(MapTask.java:165)
at org.apache.hadoop.mapred.MapTask$1.next(MapTask.java:155)
at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:115)
fetcher caught:java.lang.NullPointerException
Exception in thread "main" java.io.IOException: Job failed!
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:604)
at org.apache.nutch.fetcher.Fetcher.fetch(Fetcher.java:470)
at org.apache.nutch.crawl.Crawl.main(Crawl.java:124)
</code>
