[GitHub] nutch pull request: WARC exporter for the CommonCrawlDataDumper

jorgelbg Tue, 15 Sep 2015 06:30:13 -0700

Github user jorgelbg commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/55#discussion_r39509063
  
    --- Diff: src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java ---
    @@ -0,0 +1,337 @@
    +package org.apache.nutch.tools;
    +
    +import java.io.ByteArrayInputStream;
    +import java.io.ByteArrayOutputStream;
    +import java.io.File;
    +import java.io.IOException;
    +import java.net.URI;
    +import java.text.ParseException;
    +import java.util.Arrays;
    +import java.util.Collections;
    +import java.util.Date;
    +import java.util.List;
    +import java.util.concurrent.atomic.AtomicInteger;
    +
    +import com.ibm.icu.text.SimpleDateFormat;
    +import org.apache.commons.lang.NotImplementedException;
    +import org.apache.commons.lang.StringUtils;
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.nutch.metadata.Metadata;
    +import org.apache.nutch.parse.ParseData;
    +
    +import org.apache.nutch.parse.ParseSegment;
    +import org.apache.nutch.protocol.Content;
    +import org.archive.format.warc.WARCConstants;
    +import org.archive.io.WriterPoolMember;
    +import org.archive.io.warc.WARCRecordInfo;
    +import org.archive.io.warc.WARCWriter;
    +import org.archive.io.warc.WARCWriterPoolSettingsData;
    +import org.archive.uid.UUIDGenerator;
    +import org.archive.util.DateUtils;
    +import org.archive.util.anvl.ANVLRecord;
    +
    +public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat {
    +
    +  public static final String MAX_WARC_FILE_SIZE = "warc.file.size.max";
    +  public static final String TEMPLATE = "${prefix}-${timestamp17}-${serialno}";
    +
    +  private static final AtomicInteger SERIALNO = new AtomicInteger();
    +  private final static UUIDGenerator GENERATOR = new UUIDGenerator();
    +
    +  private String outputDir = null;
    +  private ByteArrayOutputStream out;
    +  private WARCWriter writer;
    +  private ParseData parseData;
    +
    +  public CommonCrawlFormatWARC(Configuration nutchConf,
    +      CommonCrawlConfig config) throws IOException {
    +    super(null, null, null, nutchConf, config);
    +
    +    this.out = new ByteArrayOutputStream();
    +
    +    ANVLRecord info = WARCUtils.getWARCInfoContent(nutchConf);
    +    List<String> md = Collections.singletonList(info.toString());
    +
    +    this.outputDir = config.getOutputDir();
    +
    +    if (null == outputDir) {
    +      String message = "Missing output directory configuration: " + outputDir;
    +
    +      throw new RuntimeException(message);
    +    }
    +
    +    File file = new File(outputDir);
    +
    +    long maxSize = WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE;
    +
    +    if (config.getWarcSize() > 0) {
    +      maxSize = config.getWarcSize();
    +    }
    +
    +    WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
    +        WriterPoolMember.DEFAULT_PREFIX, TEMPLATE, maxSize,
    +        config.isCompressed(), Arrays.asList(new File[] { file }), md,
    +        new UUIDGenerator());
    +
    +    writer = new WARCWriter(SERIALNO, settings);
    +  }
    +
    +  public CommonCrawlFormatWARC(String url, Content content, Metadata metadata,
    +      Configuration nutchConf, CommonCrawlConfig config, ParseData parseData)
    +      throws IOException {
    +    super(url, content, metadata, nutchConf, config);
    +
    +    this.out = new ByteArrayOutputStream();
    +    this.parseData = parseData;
    +
    +    ANVLRecord info = WARCUtils.getWARCInfoContent(conf);
    +    List<String> md = Collections.singletonList(info.toString());
    +
    +    this.outputDir = config.getOutputDir();
    +
    +    if (null == outputDir) {
    +      String message = "Missing output directory configuration: " + outputDir;
    +
    +      throw new RuntimeException(message);
    +    }
    +
    +    File file = new File(outputDir);
    +
    +    long maxSize = WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE;
    +
    +    if (config.getWarcSize() > 0) {
    +      maxSize = config.getWarcSize();
    +    }
    +
    +    WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
    +        WriterPoolMember.DEFAULT_PREFIX, TEMPLATE, maxSize,
    +        config.isCompressed(), Arrays.asList(new File[] { file }), md,
    +        new UUIDGenerator());
    +
    +    writer = new WARCWriter(SERIALNO, settings);
    +  }
    +
    +  public String getJsonData(String url, Content content, Metadata metadata,
    +      ParseData parseData) throws IOException {
    +    this.url = url;
    +    this.content = content;
    +    this.metadata = metadata;
    +    this.parseData = parseData;
    +
    +    return this.getJsonData();
    +  }
    +
    +  @Override
    +  public String getJsonData() throws IOException {
    +
    +    long position = writer.getPosition();
    +
    +    try {
    +      // See if we need to open a new file because we've exceeded maxBytes
    +
    +      // checkSize will open a new file if we exceeded the maxBytes setting
    +      writer.checkSize();
    +
    +      if (writer.getPosition() != position) {
    +        // We just closed the file because it was larger than maxBytes.
    +        position = writer.getPosition();
    +      }
    +
    +      // response record
    +      URI id = writeResponse();
    +
    +      // request record
    +      writeRequest(id);
    --- End diff --
    
    :+1:



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

[GitHub] nutch pull request: WARC exporter for the CommonCrawlDataDumper

Reply via email to