CHUKWA-749. Added Chukwa tags to Solr schema for indexing logs. (Eric Yang)
Project: http://git-wip-us.apache.org/repos/asf/chukwa/repo Commit: http://git-wip-us.apache.org/repos/asf/chukwa/commit/2d20ab59 Tree: http://git-wip-us.apache.org/repos/asf/chukwa/tree/2d20ab59 Diff: http://git-wip-us.apache.org/repos/asf/chukwa/diff/2d20ab59 Branch: refs/heads/master Commit: 2d20ab592b0db8abcad09f616e803f085dcb407a Parents: b2bcd77 Author: Eric Yang <[email protected]> Authored: Thu Jun 18 18:19:05 2015 -0700 Committer: Eric Yang <[email protected]> Committed: Thu Jun 18 18:19:05 2015 -0700 ---------------------------------------------------------------------- CHANGES.txt | 2 + contrib/solr/logs/conf/schema.xml | 8 ++- .../datacollection/writer/solr/SolrWriter.java | 52 ++++++++++++++++---- 3 files changed, 51 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/chukwa/blob/2d20ab59/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index a8c032d..26b9aa9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -14,6 +14,8 @@ Trunk (unreleased changes) IMPROVEMENTS + CHUKWA-749. Added Chukwa tags to Solr schema for indexing logs. (Eric Yang) + CHUKWA-754. Improved graph explorer selection box performance. (Eric Yang) CHUKWA-745. Improved chart configuration management. (Eric Yang) http://git-wip-us.apache.org/repos/asf/chukwa/blob/2d20ab59/contrib/solr/logs/conf/schema.xml ---------------------------------------------------------------------- diff --git a/contrib/solr/logs/conf/schema.xml b/contrib/solr/logs/conf/schema.xml index 2a22977..07daad1 100644 --- a/contrib/solr/logs/conf/schema.xml +++ b/contrib/solr/logs/conf/schema.xml @@ -111,12 +111,15 @@ installations. See the <uniqueKey> declaration below where <uniqueKey> is set to "id". --> <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> - <field name="seqId" type="string" indexed="true" stored="true" omitNorms="true"/> - <field name="type" type="text_general" indexed="true" stored="true" omitNorms="true"/> + <field name="seqId" type="long" indexed="true" stored="true" required="true" multiValued="false" /> + <field name="type" type="string" indexed="true" stored="true" omitNorms="true"/> <field name="stream" type="string" indexed="true" stored="true" multiValued="true"/> <field name="tags" type="text_en_splitting" indexed="true" stored="true" multiValued="true"/> <field name="source" type="string" indexed="true" stored="true"/> <field name="data" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> + <field name="service" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" /> + <field name="user" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true" /> + <field name="date" type="pdate" indexed="true" stored="true" multiValued="false" omitNorms="true" termVectors="true" /> <!-- catchall field, containing all other searchable text fields (implemented via copyField further on in this schema --> @@ -204,6 +207,7 @@ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/> <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/> <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/> + <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" /> <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and is a more restricted form of the canonical representation of dateTime http://git-wip-us.apache.org/repos/asf/chukwa/blob/2d20ab59/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/solr/SolrWriter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/solr/SolrWriter.java b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/solr/SolrWriter.java index 2644eb8..f67fe87 100644 --- a/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/solr/SolrWriter.java +++ b/src/main/java/org/apache/hadoop/chukwa/datacollection/writer/solr/SolrWriter.java @@ -17,7 +17,14 @@ */ package org.apache.hadoop.chukwa.datacollection.writer.solr; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; import java.util.List; +import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.hadoop.chukwa.Chunk; import org.apache.hadoop.chukwa.datacollection.agent.ChukwaAgent; @@ -33,13 +40,18 @@ import org.apache.solr.common.SolrInputDocument; public class SolrWriter extends PipelineableWriter { private static Logger log = Logger.getLogger(SolrWriter.class); private static CloudSolrServer server; - private static String ID = "id"; - private static String SEQ_ID = "seqId"; - private static String DATA_TYPE = "type"; - private static String STREAM_NAME = "stream"; - private static String TAGS = "tags"; - private static String SOURCE = "source"; - private static String DATA = "data"; + private final static String ID = "id"; + private final static String SEQ_ID = "seqId"; + private final static String DATA_TYPE = "type"; + private final static String STREAM_NAME = "stream"; + private final static String TAGS = "tags"; + private final static String SOURCE = "source"; + private final static String DATA = "data"; + private final static String USER = "user"; + private final static String SERVICE = "service"; + private final static String DATE = "date"; + private final static Pattern userPattern = Pattern.compile("user=(.+?)[, ]"); + private SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); public SolrWriter() throws WriterException { init(ChukwaAgent.getStaticConfiguration()); @@ -73,11 +85,33 @@ public class SolrWriter extends PipelineableWriter { doc.addField(SEQ_ID, chunk.getSeqID()); doc.addField(DATA_TYPE, chunk.getDataType()); doc.addField(DATA, new String(chunk.getData())); + + // TODO: improve parsing logic for more sophisticated tagging + String data = new String(chunk.getData()); + Matcher m = userPattern.matcher(data); + if(m.find()) { + doc.addField(USER, m.group(1)); + } + if(data.contains("hdfs")) { + doc.addField(SERVICE, "hdfs"); + } + if(data.contains("yarn")) { + doc.addField(SERVICE, "yarn"); + } + if(data.contains("mapredice")) { + doc.addField(SERVICE, "mapreduce"); + } + try { + Date d = sdf.parse(data); + doc.addField(DATE, d, 1.0f); + } catch(ParseException e) { + + } server.add(doc); server.commit(); } catch (Exception e) { - log.error(ExceptionUtil.getStackTrace(e)); - throw new WriterException("Failed to store data to Solr Cloud."); + log.warn("Failed to store data to Solr Cloud."); + log.warn(ExceptionUtil.getStackTrace(e)); } } if (next != null) {
