I have been working to create the Common Crawl Index Table from a set of WARC and ARC files crawled back in 2008 and 2012 by the End of Term project.
When I get to the step that creates the table, I run into the following exception:
21/12/01 08:14:13 ERROR Executor: Exception in task 5.0 in stage 1.0 (TID 6)
java.lang.StringIndexOutOfBoundsException: begin 165, end -1, length 200
at java.base/java.lang.String.checkBoundsBeginEnd(String.java:3319)
at java.base/java.lang.String.substring(String.java:1874)
at org.commoncrawl.spark.CCIndex2Table.convertCdxLine(CCIndex2Table.java:85)
at org.apache.spark.api.java.JavaPairRDD$.$anonfun$toScalaFunction$1(JavaPairRDD.scala:1070)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:225)
at org.apache.spark.sql.execution.SortExec.$anonfun$doExecute$1(SortExec.scala:119)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
dns:carney.house.gov 20090120110809 {"url": "dns:carney.house.gov", "mime": "text/dns", "status": "200", "digest": "IBWQS6J44DZG2DZGC6CAIXYQRYXTELS6", "length": "258", "offset": "57167110", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:fortenberry.houseenews.net 20090120110350 {"url": "dns:fortenberry.houseenews.net", "mime": "text/dns", "status": "200", "digest": "LJ4VR67S75IT22EYXZJA7J767X3HQQLF", "length": "248", "offset": "38197994", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:globalwarming.house.gov 20090120110057 {"url": "dns:globalwarming.house.gov", "mime": "text/dns", "status": "200", "digest": "GYANLSGMLY2ITPOUG26HU5TVBB642Z4Y", "length": "250", "offset": "22612729", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:kentucky.gov 20090120110351 {"url": "dns:kentucky.gov", "mime": "text/dns", "status": "200", "digest": "F2EGUUGHRZWVAFSAQDZLYYAY2PF3ESB7", "length": "241", "offset": "38202422", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.campbellcountyky.org 20090120110055 {"url": "dns:www.campbellcountyky.org", "mime": "text/dns", "status": "200", "digest": "G5BQUQGBQYFIB53TELWHXA2HCGMXL7PR", "length": "247", "offset": "22606867", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.gold.ky.gov 20090120110351 {"url": "dns:www.gold.ky.gov", "mime": "text/dns", "status": "200", "digest": "EF6MAYAQ4DOV4U2UWBLS3AQ3YNNJSBN4", "length": "247", "offset": "38202175", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.governor.ky.gov 20090120110848 {"url": "dns:www.governor.ky.gov", "mime": "text/dns", "status": "200", "digest": "PHWKYRZGU6N3PAZFV3CLY2JFPXK3I22J", "length": "246", "offset": "58189184", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.henrycountyky.com 20090120105705 {"url": "dns:www.henrycountyky.com", "mime": "text/dns", "status": "200", "digest": "I5AVBPKCIEU6OR3REJC74LP5PZO7ZGVR", "length": "250", "offset": "11301948", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.henrywaxman.house.gov 20090120111339 {"url": "dns:www.henrywaxman.house.gov", "mime": "text/dns", "status": "200", "digest": "77XOVXU2AAQ2KMHWBZLPSHT63JSYLDVC", "length": "265", "offset": "92635984", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.kctcs.edu 20090120110848 {"url": "dns:www.kctcs.edu", "mime": "text/dns", "status": "200", "digest": "AXNYBF5LIS7OQIC2PYHAKLUX7FMGI7Z7", "length": "241", "offset": "58189430", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.mahoney.house.gov 20090120105608 {"url": "dns:www.mahoney.house.gov", "mime": "text/dns", "status": "200", "digest": "GA5AMHEAJNXYL3YFDT4NL5CJ4ZZTCOE7", "length": "255", "offset": "219509", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120105548-00558-dp01.warc.gz"}
dns:www.aviationsystemsdivision.arc.nasa.gov 20090120111433 {"url": "dns:www.aviationsystemsdivision.arc.nasa.gov", "mime": "text/dns", "status": "200", "digest": "IKBNHMXKBP6YDDHJXE4KZFBZTCIEKXBF", "length": "264", "offset": "69679824", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120111003-01406-dp01.warc.gz"}
dns:www.spaceflight.nasa.gov 20090120111257 {"url": "dns:www.spaceflight.nasa.gov", "mime": "text/dns", "status": "200", "digest": "M6ZGWE4AF6W7WGURTYYKGI6PPDW66VGK", "length": "263", "offset": "51358480", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120111003-01406-dp01.warc.gz"}
dns:www.index.va.gov 20090120114454 {"url": "dns:www.index.va.gov", "mime": "text/dns", "status": "200", "digest": "ZIKPGE2AAW4XHM4UE7ZC3NU3SX3XRLS7", "length": "240", "offset": "57821587", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120111539-00371-vat01.cdlib.org.warc.gz"}
dns:www1.va.gov 20090120113427 {"url": "dns:www1.va.gov", "mime": "text/dns", "status": "200", "digest": "YILNQUZ34QJPCBCUJQXYY6YZ2VRIGNG2", "length": "240", "offset": "31831131", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120111539-00371-vat01.cdlib.org.warc.gz"}
dns:t2.lanl.gov 20090120115128 {"url": "dns:t2.lanl.gov", "mime": "text/dns", "status": "200", "digest": "E7FB5HHFKKODMRLTW65T3U77OAKEZAEB", "length": "237", "offset": "40048624", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120112235-00558-vat01.cdlib.org.warc.gz"}
dns:www-xdiv.lanl.gov 20090120114647 {"url": "dns:www-xdiv.lanl.gov", "mime": "text/dns", "status": "200", "digest": "GT7B5ZW262XVAG5WRCQ57H2YDI5UAKQN", "length": "245", "offset": "35110129", "filename": "crawl-data/EOT-2008/segments/CDL/warc/CDL-20090120112235-00558-vat01.cdlib.org.warc.gz"}