Code style : 2 spaces instead of tabs Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5adbcc3 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5adbcc3 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5adbcc3
Branch: refs/heads/master Commit: f5adbcc3c3bb447110b6733e0851b931e57465c3 Parents: 298cffc Author: Thamme Gowda <[email protected]> Authored: Sat Apr 30 17:15:50 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Sat Apr 30 17:15:50 2016 -0700 ---------------------------------------------------------------------- .../nutch/tools/AbstractCommonCrawlFormat.java | 694 +++++++++---------- 1 file changed, 347 insertions(+), 347 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/f5adbcc3/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java index d5a0154..1b425c4 100644 --- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java @@ -43,351 +43,351 @@ import com.ibm.icu.text.SimpleDateFormat; * */ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { - protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName()); - - protected String url; - - protected Content content; - - protected Metadata metadata; - - protected Configuration conf; - - protected String keyPrefix; - - protected boolean simpleDateFormat; - - protected boolean jsonArray; - - protected boolean reverseKey; - - protected String reverseKeyValue; - - protected List<String> inLinks; - - public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException { - this.url = url; - this.content = content; - this.metadata = metadata; - this.conf = nutchConf; - - this.keyPrefix = config.getKeyPrefix(); - this.simpleDateFormat = config.getSimpleDateFormat(); - this.jsonArray = config.getJsonArray(); - this.reverseKey = config.getReverseKey(); - this.reverseKeyValue = config.getReverseKeyValue(); - } - - public String getJsonData(String url, Content content, Metadata metadata) - throws IOException { - this.url = url; - this.content = content; - this.metadata = metadata; - - return this.getJsonData(); - } - - public String getJsonData(String url, Content content, Metadata metadata, - ParseData parseData) throws IOException { - - // override of this is required in the actual formats - throw new NotImplementedException(); - } - - @Override - public String getJsonData() throws IOException { - try { - startObject(null); - - // url - writeKeyValue("url", getUrl()); - - // timestamp - writeKeyValue("timestamp", getTimestamp()); - - // request - startObject("request"); - writeKeyValue("method", getMethod()); - startObject("client"); - writeKeyValue("hostname", getRequestHostName()); - writeKeyValue("address", getRequestHostAddress()); - writeKeyValue("software", getRequestSoftware()); - writeKeyValue("robots", getRequestRobots()); - startObject("contact"); - writeKeyValue("name", getRequestContactName()); - writeKeyValue("email", getRequestContactEmail()); - closeObject("contact"); - closeObject("client"); - // start request headers - startHeaders("headers", false, true); - writeKeyValueWrapper("Accept", getRequestAccept()); - writeKeyValueWrapper("Accept-Encoding", getRequestAcceptEncoding()); - writeKeyValueWrapper("Accept-Language", getRequestAcceptLanguage()); - writeKeyValueWrapper("User-Agent", getRequestUserAgent()); - //closeObject("headers"); - closeHeaders("headers", false, true); - writeKeyNull("body"); - closeObject("request"); - - // response - startObject("response"); - writeKeyValue("status", getResponseStatus()); - startObject("server"); - writeKeyValue("hostname", getResponseHostName()); - writeKeyValue("address", getResponseAddress()); - closeObject("server"); - // start response headers - startHeaders("headers", false, true); - writeKeyValueWrapper("Content-Encoding", getResponseContentEncoding()); - writeKeyValueWrapper("Content-Type", getResponseContentType()); - writeKeyValueWrapper("Date", getResponseDate()); - writeKeyValueWrapper("Server", getResponseServer()); - for (String name : metadata.names()) { - if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) { - continue; - } - writeKeyValueWrapper(name, metadata.get(name)); - } - closeHeaders("headers", false, true); - writeKeyValue("body", getResponseContent()); - closeObject("response"); - - // key - if (!this.keyPrefix.isEmpty()) { - this.keyPrefix += "-"; - } - writeKeyValue("key", this.keyPrefix + getKey()); - - // imported - writeKeyValue("imported", getImported()); - - if (getInLinks() != null){ - startArray("inlinks", false, true); - for (String link : getInLinks()) { - writeArrayValue(link); - } - closeArray("inlinks", false, true); - } - closeObject(null); - - return generateJson(); - - } catch (IOException ioe) { - LOG.warn("Error in processing file " + url + ": " + ioe.getMessage()); - throw new IOException("Error in generating JSON:" + ioe.getMessage()); - } - } - - // abstract methods - - protected abstract void writeKeyValue(String key, String value) throws IOException; - - protected abstract void writeKeyNull(String key) throws IOException; - - protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException; - - protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException; - - protected abstract void writeArrayValue(String value) throws IOException; - - protected abstract void startObject(String key) throws IOException; - - protected abstract void closeObject(String key) throws IOException; - - protected abstract String generateJson() throws IOException; - - // getters - - protected String getUrl() { - try { - return URIUtil.encodePath(url); - } catch (URIException e) { - LOG.error("Can't encode URL " + url); - } - - return url; - } - - protected String getTimestamp() { - if (this.simpleDateFormat) { - String timestamp = null; - try { - long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime(); - timestamp = String.valueOf(epoch); - } catch (ParseException pe) { - LOG.warn(pe.getMessage()); - } - return timestamp; - } else { - return ifNullString(metadata.get(Metadata.LAST_MODIFIED)); - } - } - - protected String getMethod() { - return new String("GET"); - } - - protected String getRequestHostName() { - String hostName = ""; - try { - hostName = InetAddress.getLocalHost().getHostName(); - } catch (UnknownHostException uhe) { - - } - return hostName; - } - - protected String getRequestHostAddress() { - String hostAddress = ""; - try { - hostAddress = InetAddress.getLocalHost().getHostAddress(); - } catch (UnknownHostException uhe) { - - } - return hostAddress; - } - - protected String getRequestSoftware() { - return conf.get("http.agent.version", ""); - } - - protected String getRequestRobots() { - return new String("CLASSIC"); - } - - protected String getRequestContactName() { - return conf.get("http.agent.name", ""); - } - - protected String getRequestContactEmail() { - return conf.get("http.agent.email", ""); - } - - protected String getRequestAccept() { - return conf.get("http.accept", ""); - } - - protected String getRequestAcceptEncoding() { - return new String(""); // TODO - } - - protected String getRequestAcceptLanguage() { - return conf.get("http.accept.language", ""); - } - - protected String getRequestUserAgent() { - return conf.get("http.robots.agents", ""); - } - - protected String getResponseStatus() { - return ifNullString(metadata.get("status")); - } - - protected String getResponseHostName() { - return URLUtil.getHost(url); - } - - protected String getResponseAddress() { - return ifNullString(metadata.get("_ip_")); - } - - protected String getResponseContentEncoding() { - return ifNullString(metadata.get("Content-Encoding")); - } - - protected String getResponseContentType() { - return ifNullString(metadata.get("Content-Type")); - } - - public List<String> getInLinks() { - return inLinks; - } - - public void setInLinks(List<String> inLinks) { - this.inLinks = inLinks; - } - - protected String getResponseDate() { - if (this.simpleDateFormat) { - String timestamp = null; - try { - long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime(); - timestamp = String.valueOf(epoch); - } catch (ParseException pe) { - LOG.warn(pe.getMessage()); - } - return timestamp; - } else { - return ifNullString(metadata.get("Date")); - } - } - - protected String getResponseServer() { - return ifNullString(metadata.get("Server")); - } - - protected String getResponseContent() { - return new String(content.getContent()); - } - - protected String getKey() { - if (this.reverseKey) { - return this.reverseKeyValue; - } - else { - return url; - } - } - - protected String getImported() { - if (this.simpleDateFormat) { - String timestamp = null; - try { - long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime(); - timestamp = String.valueOf(epoch); - } catch (ParseException pe) { - LOG.warn(pe.getMessage()); - } - return timestamp; - } else { - return ifNullString(metadata.get("Date")); - } - } - - private static String ifNullString(String value) { - return (value != null) ? value : ""; - } - - private void startHeaders(String key, boolean nested, boolean newline) throws IOException { - if (this.jsonArray) { - startArray(key, nested, newline); - } - else { - startObject(key); - } - } - - private void closeHeaders(String key, boolean nested, boolean newline) throws IOException { - if (this.jsonArray) { - closeArray(key, nested, newline); - } - else { - closeObject(key); - } - } - - private void writeKeyValueWrapper(String key, String value) throws IOException { - if (this.jsonArray) { - startArray(null, true, false); - writeArrayValue(key); - writeArrayValue(value); - closeArray(null, true, false); - } - else { - writeKeyValue(key, value); - } - } - - @Override - public void close() {} + protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName()); + + protected String url; + + protected Content content; + + protected Metadata metadata; + + protected Configuration conf; + + protected String keyPrefix; + + protected boolean simpleDateFormat; + + protected boolean jsonArray; + + protected boolean reverseKey; + + protected String reverseKeyValue; + + protected List<String> inLinks; + + public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException { + this.url = url; + this.content = content; + this.metadata = metadata; + this.conf = nutchConf; + + this.keyPrefix = config.getKeyPrefix(); + this.simpleDateFormat = config.getSimpleDateFormat(); + this.jsonArray = config.getJsonArray(); + this.reverseKey = config.getReverseKey(); + this.reverseKeyValue = config.getReverseKeyValue(); + } + + public String getJsonData(String url, Content content, Metadata metadata) + throws IOException { + this.url = url; + this.content = content; + this.metadata = metadata; + + return this.getJsonData(); + } + + public String getJsonData(String url, Content content, Metadata metadata, + ParseData parseData) throws IOException { + + // override of this is required in the actual formats + throw new NotImplementedException(); + } + + @Override + public String getJsonData() throws IOException { + try { + startObject(null); + + // url + writeKeyValue("url", getUrl()); + + // timestamp + writeKeyValue("timestamp", getTimestamp()); + + // request + startObject("request"); + writeKeyValue("method", getMethod()); + startObject("client"); + writeKeyValue("hostname", getRequestHostName()); + writeKeyValue("address", getRequestHostAddress()); + writeKeyValue("software", getRequestSoftware()); + writeKeyValue("robots", getRequestRobots()); + startObject("contact"); + writeKeyValue("name", getRequestContactName()); + writeKeyValue("email", getRequestContactEmail()); + closeObject("contact"); + closeObject("client"); + // start request headers + startHeaders("headers", false, true); + writeKeyValueWrapper("Accept", getRequestAccept()); + writeKeyValueWrapper("Accept-Encoding", getRequestAcceptEncoding()); + writeKeyValueWrapper("Accept-Language", getRequestAcceptLanguage()); + writeKeyValueWrapper("User-Agent", getRequestUserAgent()); + //closeObject("headers"); + closeHeaders("headers", false, true); + writeKeyNull("body"); + closeObject("request"); + + // response + startObject("response"); + writeKeyValue("status", getResponseStatus()); + startObject("server"); + writeKeyValue("hostname", getResponseHostName()); + writeKeyValue("address", getResponseAddress()); + closeObject("server"); + // start response headers + startHeaders("headers", false, true); + writeKeyValueWrapper("Content-Encoding", getResponseContentEncoding()); + writeKeyValueWrapper("Content-Type", getResponseContentType()); + writeKeyValueWrapper("Date", getResponseDate()); + writeKeyValueWrapper("Server", getResponseServer()); + for (String name : metadata.names()) { + if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) { + continue; + } + writeKeyValueWrapper(name, metadata.get(name)); + } + closeHeaders("headers", false, true); + writeKeyValue("body", getResponseContent()); + closeObject("response"); + + // key + if (!this.keyPrefix.isEmpty()) { + this.keyPrefix += "-"; + } + writeKeyValue("key", this.keyPrefix + getKey()); + + // imported + writeKeyValue("imported", getImported()); + + if (getInLinks() != null){ + startArray("inlinks", false, true); + for (String link : getInLinks()) { + writeArrayValue(link); + } + closeArray("inlinks", false, true); + } + closeObject(null); + + return generateJson(); + + } catch (IOException ioe) { + LOG.warn("Error in processing file " + url + ": " + ioe.getMessage()); + throw new IOException("Error in generating JSON:" + ioe.getMessage()); + } + } + + // abstract methods + + protected abstract void writeKeyValue(String key, String value) throws IOException; + + protected abstract void writeKeyNull(String key) throws IOException; + + protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException; + + protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException; + + protected abstract void writeArrayValue(String value) throws IOException; + + protected abstract void startObject(String key) throws IOException; + + protected abstract void closeObject(String key) throws IOException; + + protected abstract String generateJson() throws IOException; + + // getters + + protected String getUrl() { + try { + return URIUtil.encodePath(url); + } catch (URIException e) { + LOG.error("Can't encode URL " + url); + } + + return url; + } + + protected String getTimestamp() { + if (this.simpleDateFormat) { + String timestamp = null; + try { + long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime(); + timestamp = String.valueOf(epoch); + } catch (ParseException pe) { + LOG.warn(pe.getMessage()); + } + return timestamp; + } else { + return ifNullString(metadata.get(Metadata.LAST_MODIFIED)); + } + } + + protected String getMethod() { + return new String("GET"); + } + + protected String getRequestHostName() { + String hostName = ""; + try { + hostName = InetAddress.getLocalHost().getHostName(); + } catch (UnknownHostException uhe) { + + } + return hostName; + } + + protected String getRequestHostAddress() { + String hostAddress = ""; + try { + hostAddress = InetAddress.getLocalHost().getHostAddress(); + } catch (UnknownHostException uhe) { + + } + return hostAddress; + } + + protected String getRequestSoftware() { + return conf.get("http.agent.version", ""); + } + + protected String getRequestRobots() { + return new String("CLASSIC"); + } + + protected String getRequestContactName() { + return conf.get("http.agent.name", ""); + } + + protected String getRequestContactEmail() { + return conf.get("http.agent.email", ""); + } + + protected String getRequestAccept() { + return conf.get("http.accept", ""); + } + + protected String getRequestAcceptEncoding() { + return new String(""); // TODO + } + + protected String getRequestAcceptLanguage() { + return conf.get("http.accept.language", ""); + } + + protected String getRequestUserAgent() { + return conf.get("http.robots.agents", ""); + } + + protected String getResponseStatus() { + return ifNullString(metadata.get("status")); + } + + protected String getResponseHostName() { + return URLUtil.getHost(url); + } + + protected String getResponseAddress() { + return ifNullString(metadata.get("_ip_")); + } + + protected String getResponseContentEncoding() { + return ifNullString(metadata.get("Content-Encoding")); + } + + protected String getResponseContentType() { + return ifNullString(metadata.get("Content-Type")); + } + + public List<String> getInLinks() { + return inLinks; + } + + public void setInLinks(List<String> inLinks) { + this.inLinks = inLinks; + } + + protected String getResponseDate() { + if (this.simpleDateFormat) { + String timestamp = null; + try { + long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime(); + timestamp = String.valueOf(epoch); + } catch (ParseException pe) { + LOG.warn(pe.getMessage()); + } + return timestamp; + } else { + return ifNullString(metadata.get("Date")); + } + } + + protected String getResponseServer() { + return ifNullString(metadata.get("Server")); + } + + protected String getResponseContent() { + return new String(content.getContent()); + } + + protected String getKey() { + if (this.reverseKey) { + return this.reverseKeyValue; + } + else { + return url; + } + } + + protected String getImported() { + if (this.simpleDateFormat) { + String timestamp = null; + try { + long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime(); + timestamp = String.valueOf(epoch); + } catch (ParseException pe) { + LOG.warn(pe.getMessage()); + } + return timestamp; + } else { + return ifNullString(metadata.get("Date")); + } + } + + private static String ifNullString(String value) { + return (value != null) ? value : ""; + } + + private void startHeaders(String key, boolean nested, boolean newline) throws IOException { + if (this.jsonArray) { + startArray(key, nested, newline); + } + else { + startObject(key); + } + } + + private void closeHeaders(String key, boolean nested, boolean newline) throws IOException { + if (this.jsonArray) { + closeArray(key, nested, newline); + } + else { + closeObject(key); + } + } + + private void writeKeyValueWrapper(String key, String value) throws IOException { + if (this.jsonArray) { + startArray(null, true, false); + writeArrayValue(key); + writeArrayValue(value); + closeArray(null, true, false); + } + else { + writeKeyValue(key, value); + } + } + + @Override + public void close() {} }
