Github user jorgelbg commented on a diff in the pull request:
https://github.com/apache/nutch/pull/55#discussion_r39509063
--- Diff: src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java ---
@@ -0,0 +1,337 @@
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.ibm.icu.text.SimpleDateFormat;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.ParseData;
+
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.protocol.Content;
+import org.archive.format.warc.WARCConstants;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.warc.WARCRecordInfo;
+import org.archive.io.warc.WARCWriter;
+import org.archive.io.warc.WARCWriterPoolSettingsData;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.DateUtils;
+import org.archive.util.anvl.ANVLRecord;
+
+public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat {
+
+ public static final String MAX_WARC_FILE_SIZE = "warc.file.size.max";
+ public static final String TEMPLATE =
"${prefix}-${timestamp17}-${serialno}";
+
+ private static final AtomicInteger SERIALNO = new AtomicInteger();
+ private final static UUIDGenerator GENERATOR = new UUIDGenerator();
+
+ private String outputDir = null;
+ private ByteArrayOutputStream out;
+ private WARCWriter writer;
+ private ParseData parseData;
+
+ /**
+  * Creates a WARC formatter that is not bound to a document yet: the url,
+  * content and metadata handed to the superclass are all {@code null}.
+  *
+  * @param nutchConf configuration used for the superclass and for building
+  *                  the warcinfo content
+  * @param config    Common Crawl settings; the output directory, maximum WARC
+  *                  size and compression flag are read from it
+  * @throws IOException as declared by the original constructor
+  * @throws IllegalArgumentException if {@code config} has no output directory
+  */
+ public CommonCrawlFormatWARC(Configuration nutchConf,
+     CommonCrawlConfig config) throws IOException {
+   super(null, null, null, nutchConf, config);
+
+   this.out = new ByteArrayOutputStream();
+   this.outputDir = config.getOutputDir();
+   this.writer = createWriter(nutchConf, config, this.outputDir);
+ }
+
+ /**
+  * Creates a WARC formatter bound to a single document.
+  *
+  * @param url       url of the document
+  * @param content   fetched content of the document
+  * @param metadata  metadata of the document
+  * @param nutchConf configuration passed to the superclass
+  * @param config    Common Crawl settings; the output directory, maximum WARC
+  *                  size and compression flag are read from it
+  * @param parseData parse data of the document
+  * @throws IOException as declared by the original constructor
+  * @throws IllegalArgumentException if {@code config} has no output directory
+  */
+ public CommonCrawlFormatWARC(String url, Content content, Metadata metadata,
+     Configuration nutchConf, CommonCrawlConfig config, ParseData parseData)
+     throws IOException {
+   super(url, content, metadata, nutchConf, config);
+
+   this.out = new ByteArrayOutputStream();
+   this.parseData = parseData;
+   this.outputDir = config.getOutputDir();
+   // The warcinfo content is built from the inherited "conf" field here, as
+   // in the original code, rather than from the nutchConf parameter.
+   this.writer = createWriter(conf, config, this.outputDir);
+ }
+
+ /**
+  * Builds the WARC writer; shared by both constructors so the setup exists
+  * in one place only.
+  *
+  * @param warcInfoConf configuration the warcinfo content is generated from
+  * @param config       source of the maximum WARC size and compression flag
+  * @param outputDir    directory the WARC files are written to
+  * @return a writer using the shared serial number counter
+  * @throws IOException reserved for failures of the underlying setup calls
+  * @throws IllegalArgumentException if {@code outputDir} is {@code null}
+  */
+ private static WARCWriter createWriter(Configuration warcInfoConf,
+     CommonCrawlConfig config, String outputDir) throws IOException {
+   // Fail before doing any work. The previous message appended outputDir,
+   // which is always null on this path and only added the word "null".
+   if (outputDir == null) {
+     throw new IllegalArgumentException(
+         "Missing output directory configuration");
+   }
+
+   ANVLRecord info = WARCUtils.getWARCInfoContent(warcInfoConf);
+   List<String> md = Collections.singletonList(info.toString());
+
+   // A non-positive configured size means "use the library default".
+   long configuredSize = config.getWarcSize();
+   long maxSize = configuredSize > 0
+       ? configuredSize
+       : WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE;
+
+   WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+       WriterPoolMember.DEFAULT_PREFIX, TEMPLATE, maxSize,
+       config.isCompressed(), Arrays.asList(new File(outputDir)), md,
+       new UUIDGenerator());
+
+   return new WARCWriter(SERIALNO, settings);
+ }
+
+ /**
+  * Rebinds this formatter to the given document and then delegates to the
+  * no-argument {@link #getJsonData()}.
+  *
+  * @param url       stored in the {@code url} field
+  * @param content   stored in the {@code content} field
+  * @param metadata  stored in the {@code metadata} field
+  * @param parseData stored in the {@code parseData} field
+  * @return whatever the no-argument {@code getJsonData()} returns
+  * @throws IOException if the delegated call throws it
+  */
+ public String getJsonData(String url, Content content, Metadata metadata,
+     ParseData parseData) throws IOException {
+   // Replace the per-document state before producing the output.
+   this.parseData = parseData;
+   this.metadata = metadata;
+   this.content = content;
+   this.url = url;
+
+   final String result = getJsonData();
+   return result;
+ }
+
+ @Override
+ public String getJsonData() throws IOException {
+
+ long position = writer.getPosition();
+
+ try {
+ // See if we need to open a new file because we've exceeded maxBytes
+
+ // checkSize will open a new file if we exceeded the maxBytes setting
+ writer.checkSize();
+
+ if (writer.getPosition() != position) {
+ // We just closed the file because it was larger than maxBytes.
+ position = writer.getPosition();
+ }
+
+ // response record
+ URI id = writeResponse();
+
+ // request record
+ writeRequest(id);
--- End diff --
:+1:
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---