http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/querydata.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/querydata.conf b/extras/rya.manual/src/main/webapp/querydata.conf new file mode 100644 index 0000000..1f93944 --- /dev/null +++ b/extras/rya.manual/src/main/webapp/querydata.conf @@ -0,0 +1,115 @@ +h1. Query Data + +There are a few mechanisms to query data + +h2. Web JSP endpoint + +Open a url to {{http://server/web.rya/sparqlQuery.jsp}}. This simple form can run Sparql. + +h2. Web REST endpoint + +The War sets up a Web REST endpoint at {{http://server/web.rya/queryrdf}} that allows GET requests with queries. + +For this sample, we will assume you already loaded data from the [loaddata.html] tutorial + +Save this file somewhere $RDF_DATA + +Second, use the following Java code to query data from the REST endpoint: +{code} +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; + +public class QueryDataServletRun { + + public static void main(String[] args) { + try { + String query = "select * where {\n" + + "<http://mynamespace/ProductType1> ?p ?o.\n" + + "}"; + + String queryenc = URLEncoder.encode(query, "UTF-8"); + + URL url = new URL("http://server/rdfTripleStore/queryrdf?query=" + queryenc); + URLConnection urlConnection = url.openConnection(); + urlConnection.setDoOutput(true); + + BufferedReader rd = new BufferedReader(new InputStreamReader( + urlConnection.getInputStream())); + String line; + while ((line = rd.readLine()) != null) { + System.out.println(line); + } + rd.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } +} +{code} + +Compile and run the code above, changing the url to the one that your Rdf War is running at. + +h2. Direct Code + +Here is a code snippet for directly running against Accumulo with the code. 
You will need at least accumulo.rya.jar, rya.api, rya.sail.impl on the classpath and transitive dependencies. I find that Maven is the easiest way to get a project dependency tree set up. + +{code} + Connector connector = new ZooKeeperInstance("cbinstance", "zkserver:port").getConnector("cbuser", "cbpassword"); + + final RdfCloudTripleStore store = new RdfCloudTripleStore(); + AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); + crdfdao.setConnector(connector); + + AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); + conf.setTablePrefix("rts_"); + conf.setDisplayQueryPlan(true); + crdfdao.setConf(conf); + store.setRdfDao(crdfdao); + + ProspectorServiceEvalStatsDAO evalDao = new ProspectorServiceEvalStatsDAO(connector, conf); + evalDao.init(); + store.setRdfEvalStatsDAO(evalDao); + + InferenceEngine inferenceEngine = new InferenceEngine(); + inferenceEngine.setRdfDao(crdfdao); + inferenceEngine.setConf(conf); + store.setInferenceEngine(inferenceEngine); + + Repository myRepository = new RyaSailRepository(store); + myRepository.initialize(); + + String query = "select * where {\n" + + "<http://mynamespace/ProductType1> ?p ?o.\n" + + "}"; + RepositoryConnection conn = myRepository.getConnection(); + System.out.println(query); + TupleQuery tupleQuery = conn.prepareTupleQuery( + QueryLanguage.SPARQL, query); + ValueFactory vf = ValueFactoryImpl.getInstance(); + + TupleQueryResultHandler writer = new SPARQLResultsXMLWriter(System.out); + tupleQuery.evaluate(new TupleQueryResultHandler() { + + int count = 0; + + @Override + public void startQueryResult(List<String> strings) throws TupleQueryResultHandlerException { + } + + @Override + public void endQueryResult() throws TupleQueryResultHandlerException { + } + + @Override + public void handleSolution(BindingSet bindingSet) throws TupleQueryResultHandlerException { + System.out.println(bindingSet); + } + }); + + conn.close(); + myRepository.shutDown(); +{code} +
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/quickstart.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/quickstart.conf b/extras/rya.manual/src/main/webapp/quickstart.conf new file mode 100644 index 0000000..ce0d20b --- /dev/null +++ b/extras/rya.manual/src/main/webapp/quickstart.conf @@ -0,0 +1,42 @@ +h1. Quick Start + +This tutorial will outline the steps needed to get quickly started with the Rya store using the web based endpoint. + +h2. Prerequisites + +* Columnar Store (either Accumulo or Cloudbase) The tutorial will go forward using Accumulo +* Rya code (Git: git@github.com:texeltek/rya.git) +* Maven 2.2 + + +h2. Building from Source + +Using Git, pull down the latest code from the url above. + +Run the command to build the code {{mvn clean install}} + +If all goes well, the build should be successful and a war should be produced in {{web/web.rya/target/web.rya.war}} + +h2. Deployment + +(Using tomcat) + +Unwar the above war into the webapps directory. + +To point the web.rya war to the appropriate Accumulo instance, make a properties file {{environment.properties}} and put it in the classpath. Here is an example: +{code} +instance.name=accumulo #Accumulo instance name +instance.zk=localhost:2181 #Accumulo Zookeepers +instance.username=root #Accumulo username +instance.password=secret #Accumulo pwd +rya.tableprefix=triplestore_ #Rya Table Prefix +rya.displayqueryplan=true #To display the query plan +{code} + +Start the Tomcat server. {{./bin/startup.sh}} + +h2. Usage + +First, we need to load data. See the [Load Data Section|./loaddata] + +Second, we need to query that data. 
See the [Query Data Section|./querydata] + http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-addauth.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-addauth.conf b/extras/rya.manual/src/main/webapp/sm-addauth.conf new file mode 100644 index 0000000..371d8da --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-addauth.conf @@ -0,0 +1,98 @@ +h1. Add Authentication + +This tutorial will give a few examples on how to load and query data with authentication. + +This is only available for Cloudbase and Accumulo because they provide the security filters necessary to do row level authentication and visibility. + +h2. Load Data with Visibilities + +During the Load process, there are a few ways to set the Column Visibility you want set on each of the corresponding rdf rows. + +h3. Global Visibility + +You can set the Column Visibility globally on the RdfCloudTripleStore, and it will use that particular value for every row saved. + +To do this, once you create and set up the RdfCloudTripleStore, just set the property on the store configuration: + +{code} +//setup +final RdfCloudTripleStore store = new RdfCloudTripleStore(); +AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); +crdfdao.setConnector(connector); + +AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); +conf.setTablePrefix("rts_"); +conf.setDisplayQueryPlan(true); + +//set global column Visibility +conf.setCv("AUTH1|AUTH2"); + +crdfdao.setConf(conf); +store.setRdfDao(crdfdao); +{code} + +The format is simply the same as the Column Visibility format. + +h3. Per triple or document based Visibility + +TODO: Not available as of yet + +h2. Query Data with Authentication + +Attaching an Authentication to the query process is very simple. 
It requires just adding the property {{RdfCloudTripleStoreConfiguration.CONF_QUERY_AUTH}} to the query {{BindingSet}} +Example: + +{code} +//setup +Connector connector = new ZooKeeperInstance("cbinstance", "zkserver:port").getConnector("cbuser", "cbpassword"); +final RdfCloudTripleStore store = new RdfCloudTripleStore(); +AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); +crdfdao.setConnector(connector); + +AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); +conf.setTablePrefix("rts_"); +conf.setDisplayQueryPlan(true); +crdfdao.setConf(conf); +//set global column Visibility +conf.setCv("1|2"); +store.setRdfDao(crdfdao); + +InferenceEngine inferenceEngine = new InferenceEngine(); +inferenceEngine.setRdfDao(crdfdao); +inferenceEngine.setConf(conf); +store.setInferenceEngine(inferenceEngine); + +Repository myRepository = new RyaSailRepository(store); +myRepository.initialize(); +RepositoryConnection conn = myRepository.getConnection(); + +//define and add statement +String litdupsNS = "urn:test:litdups#"; +URI cpu = vf.createURI(litdupsNS, "cpu"); +URI loadPerc = vf.createURI(litdupsNS, "loadPerc"); +URI uri1 = vf.createURI(litdupsNS, "uri1"); +conn.add(cpu, loadPerc, uri1); +conn.commit(); + +//query with auth +String query = "select * where {" + + "<" + cpu.toString() + "> ?p ?o1." 
+ + "}"; +TupleQuery tupleQuery = conn.prepareTupleQuery(QueryLanguage.SPARQL, query); +tupleQuery.setBinding(RdfCloudTripleStoreConfiguration.CONF_QUERY_AUTH, vf.createLiteral("2")); +TupleQueryResult result = tupleQuery.evaluate(); +while(result.hasNext()) { + System.out.println(result.next()); +} +result.close(); + +//close +conn.close(); +myRepository.shutDown(); +{code} + +Or you can set a global auth using the configuration: + +{code} +conf.setAuth("2") +{code} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-firststeps.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-firststeps.conf b/extras/rya.manual/src/main/webapp/sm-firststeps.conf new file mode 100644 index 0000000..04a56b9 --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-firststeps.conf @@ -0,0 +1,55 @@ +h1. Typical First Steps + +In this tutorial, I will give you a quick overview of some of the first steps I perform to get data loaded and ready for query. + +h2. Prerequisites + + We are assuming Accumulo 1.4+ usage here. + + * Rya Source Code ({{web.rya.war}}) + * Accumulo on top of Hadoop 0.20+ + * RDF Data (in N-Triples format, this format is the easiest to bulk load) + +h2. Building Source + +Skip this section if you already have the Map Reduce artifact and the WAR + +See the [Build From Source Section|./build-source.html] to get the appropriate artifacts built + +h2. Load Data + +I find that the best way to load the data is through the Bulk Load Map Reduce job. + +# Save the RDF Data above onto HDFS. From now on we will refer to this location as <RDF_HDFS_LOCATION> +# Move the cloudbase.rya-<version>-job.jar onto the hadoop cluster +# Bulk load the data. 
Here is a sample command line: +{code} +hadoop jar ../cloudbase.rya-2.0.0-SNAPSHOT-job.jar BulkNtripsInputTool -Drdf.tablePrefix=lubm_ -Dcb.username=cbuser -Dcb.pwd=cbpwd -Dcb.instance=cbinstance -Dcb.zk=zookeeperLocation -Drdf.format=N-Triples <RDF_HDFS_LOCATION> +{code} + +Once the data is loaded, it is actually a good practice to compact your tables. You can do this by opening the cloudbase shell {{cbshell}} and running the {{compact}} command on the generated tables. Remember the generated tables will be prefixed by the {{rdf.tablePrefix}} property you assigned above. The default tablePrefix is {{rts}}. +Here is a sample cloudbase shell command: +{code} +compact -p lubm_(.*) +{code} + +See the [Load Data Section|./loaddata.html] for more options on loading rdf data + +h2. Run the Statistics Optimizer + +For the best query performance, it is recommended to run the Statistics Optimizer to create the Evaluation Statistics table. This job will read through your data and gather statistics on the distribution of the dataset. This table is then queried before query execution to reorder queries based on the data distribution. + +See the [Evaluation Statistics Table Section|eval.html] on how to do this. + +h2. Query data + +I find the easiest way to query is just to use the WAR. Load the WAR into your favorite web application container and go to the sparqlQuery.jsp page. Example: +{code} +http://localhost:8080/web.rya/sparqlQuery.jsp +{code} + +This page provides a very simple text box for running queries against the store and getting data back. (SPARQL queries) + +Remember to update the connection information in the WAR: {{WEB-INF/spring/spring-cloudbase.xml}} + +See the [Query data section|./querydata.html] for more information. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-infer.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-infer.conf b/extras/rya.manual/src/main/webapp/sm-infer.conf new file mode 100644 index 0000000..ab25281 --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-infer.conf @@ -0,0 +1,313 @@ +h1. Inferencing + +The Rdf store provides simple inferencing currently. The supported list of inferred relationships include: + +* rdfs:subClassOf +* rdfs:subPropertyOf +* owl:EquivalentProperty +* owl:inverseOf +* owl:SymmetricProperty +* owl:TransitiveProperty (This is currently in beta and will not work for every case) + +h2. Setup + +The Inferencing Engine is a scheduled job that runs by default every 5 minutes, this is configurable, to query the relationships in the store and develop the inferred graphs necessary to answer inferencing questions. + +This also means that if you load a model into the store, it could take up to 5 minutes for the inferred relationships to be available. + +As usual you will need to set up your {{RdfCloudTripleStore}} with the correct DAO, notice we add an {{InferencingEngine}} as well to the store. 
If this is not added, then no inferencing will be done on the queries: + +{code} +//setup +Connector connector = new ZooKeeperInstance("cbinstance", "zkserver:port").getConnector("cbuser", "cbpassword"); +final RdfCloudTripleStore store = new RdfCloudTripleStore(); +AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); +crdfdao.setConnector(connector); + +AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); +conf.setTablePrefix("rts_"); +conf.setDisplayQueryPlan(true); +crdfdao.setConf(conf); +store.setRdfDao(crdfdao); + +ProspectorServiceEvalStatsDAO evalDao = new ProspectorServiceEvalStatsDAO(connector, conf); +evalDao.init(); +store.setRdfEvalStatsDAO(evalDao); + +InferenceEngine inferenceEngine = new InferenceEngine(); +inferenceEngine.setRdfDao(crdfdao); +inferenceEngine.setConf(conf); +store.setInferenceEngine(inferenceEngine); + +Repository myRepository = new RyaSailRepository(store); +myRepository.initialize(); +RepositoryConnection conn = myRepository.getConnection(); + +//query code goes here + +//close +conn.close(); +myRepository.shutDown(); +{code} + +h2. Samples + +We will go through some quick samples on loading inferred relationships, seeing and diagnosing the query plan, and checking the data + +h3. Rdfs:SubClassOf + +First the code, which will load the following subclassof relationship: {{UndergraduateStudent subclassof Student subclassof Person}}. 
Then we will load into the tables three triples defining {{UgradA rdf:type UndergraduateStudent, StudentB rdf:type Student, PersonC rdf:type Person}} + +{code} +conn.add(new StatementImpl(vf.createURI(litdupsNS, "UndergraduateStudent"), RDFS.SUBCLASSOF, vf.createURI(litdupsNS, "Student"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "Student"), RDFS.SUBCLASSOF, vf.createURI(litdupsNS, "Person"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "UgradA"), RDF.TYPE, vf.createURI(litdupsNS, "UndergraduateStudent"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "StudentB"), RDF.TYPE, vf.createURI(litdupsNS, "Student"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "PersonC"), RDF.TYPE, vf.createURI(litdupsNS, "Person"))); +conn.commit(); +{code} + +Remember that once the model is committed, it may take up to 5 minutes for the inferred relationships to be ready. Though you can override this property in the {{InferencingEngine}}. + +We shall run the following query: + +{code} +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX lit: <urn:test:litdups#> +select * where {?s rdf:type lit:Person.} +{code} + +And should get back the following results: +{code} +[s=urn:test:litdups#StudentB] +[s=urn:test:litdups#PersonC] +[s=urn:test:litdups#UgradA] +{code} + +h4. 
How it works + +Let us look at the query plan: +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "s" + Join + FixedStatementPattern + Var (name=79f261ee-e930-4af1-bc09-e637cc0affef) + Var (name=c-79f261ee-e930-4af1-bc09-e637cc0affef, value=http://www.w3.org/2000/01/rdf-schema#subClassOf) + Var (name=-const-2, value=urn:test:litdups#Person, anonymous) + DoNotExpandSP + Var (name=s) + Var (name=-const-1, value=http://www.w3.org/1999/02/22-rdf-syntax-ns#type, anonymous) + Var (name=79f261ee-e930-4af1-bc09-e637cc0affef) +{code} + +Basically, we first find out (through the InferencingEngine) what triples have subclassof with Person. The InferencingEngine will do the graph analysis to find that both Student and UndergraduateStudent are Person classes. +Then this information is joined with the statement pattern {{(?s rdf:type ?inf)}} where {{?inf}} is the results from the InferencingEngine. + +h3. Rdfs:SubPropertyOf + +SubPropertyOf defines that a property can be an instance of another property. For example, a {{gradDegreeFrom subPropertyOf degreeFrom}}. 
+ +Also, EquivalentProperty can be thought of as a specialized SubPropertyOf relationship where if {{propA equivalentProperty propB}} then that means that {{propA subPropertyOf propB AND propB subPropertyOf propA}} + +Sample Code: +{code} +conn.add(new StatementImpl(vf.createURI(litdupsNS, "undergradDegreeFrom"), RDFS.SUBPROPERTYOF, vf.createURI(litdupsNS, "degreeFrom"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "gradDegreeFrom"), RDFS.SUBPROPERTYOF, vf.createURI(litdupsNS, "degreeFrom"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "degreeFrom"), RDFS.SUBPROPERTYOF, vf.createURI(litdupsNS, "memberOf"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "memberOf"), RDFS.SUBPROPERTYOF, vf.createURI(litdupsNS, "associatedWith"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "UgradA"), vf.createURI(litdupsNS, "undergradDegreeFrom"), vf.createURI(litdupsNS, "Harvard"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "GradB"), vf.createURI(litdupsNS, "gradDegreeFrom"), vf.createURI(litdupsNS, "Yale"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "ProfessorC"), vf.createURI(litdupsNS, "memberOf"), vf.createURI(litdupsNS, "Harvard"))); +conn.commit(); +{code} + +With query: +{code} +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX lit: <urn:test:litdups#> +select * where {?s lit:memberOf lit:Harvard.} +{code} + +Will return results: +{code} +[s=urn:test:litdups#UgradA] +[s=urn:test:litdups#ProfessorC] +{code} + +Since UgradA has undergradDegreeFrom Harvard and ProfessorC is memberOf Harvard. + +h4. How it works + +This is very similar to the subClassOf relationship above. Basically the InferencingEngine provides what properties are subPropertyOf relationships with memberOf, and the second part of the Join checks to see if those properties are predicates with object "Harvard". 
+ +Query Plan: +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "s" + Join + FixedStatementPattern + Var (name=0bad69f3-4769-4293-8318-e828b23dc52a) + Var (name=c-0bad69f3-4769-4293-8318-e828b23dc52a, value=http://www.w3.org/2000/01/rdf-schema#subPropertyOf) + Var (name=-const-1, value=urn:test:litdups#memberOf, anonymous) + DoNotExpandSP + Var (name=s) + Var (name=0bad69f3-4769-4293-8318-e828b23dc52a) + Var (name=-const-2, value=urn:test:litdups#Harvard, anonymous) +{code} + +h3. InverseOf + +InverseOf defines a property that is an inverse relation of another property. For example, a student who has a {{degreeFrom}} a University also means that the University {{hasAlumnus}} student. + +Code: +{code} +conn.add(new StatementImpl(vf.createURI(litdupsNS, "degreeFrom"), OWL.INVERSEOF, vf.createURI(litdupsNS, "hasAlumnus"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "UgradA"), vf.createURI(litdupsNS, "degreeFrom"), vf.createURI(litdupsNS, "Harvard"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "GradB"), vf.createURI(litdupsNS, "degreeFrom"), vf.createURI(litdupsNS, "Harvard"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "Harvard"), vf.createURI(litdupsNS, "hasAlumnus"), vf.createURI(litdupsNS, "AlumC"))); +conn.commit(); +{code} + +Query: +{code} +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX lit: <urn:test:litdups#> +select * where {lit:Harvard lit:hasAlumnus ?s.} +{code} + +Result: +{code} +[s=urn:test:litdups#AlumC] +[s=urn:test:litdups#GradB] +[s=urn:test:litdups#UgradA] +{code} + +h4. How it works + +The query planner will expand the statement pattern {{Harvard hasAlumnus ?s}} to a Union between {{Harvard hasAlumnus ?s. 
and ?s degreeFrom Harvard}} + +As a caveat, it is important to note that in general Union queries do not have the best performance, so having a property that has an inverseOf and subPropertyOf, could cause a query plan that might take long depending on how the query planner orders the joins. + +Query Plan +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "s" + InferUnion + StatementPattern + Var (name=-const-1, value=urn:test:litdups#Harvard, anonymous) + Var (name=-const-2, value=urn:test:litdups#hasAlumnus, anonymous) + Var (name=s) + StatementPattern + Var (name=s) + Var (name=-const-2, value=urn:test:litdups#degreeFrom) + Var (name=-const-1, value=urn:test:litdups#Harvard, anonymous) +{code} + +h3. SymmetricProperty + +SymmetricProperty defines a relationship where, for example, if Bob is a friendOf Jeff, then Jeff is a friendOf Bob. (Hopefully) + +Code: +{code} +conn.add(new StatementImpl(vf.createURI(litdupsNS, "friendOf"), RDF.TYPE, OWL.SYMMETRICPROPERTY)); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "Bob"), vf.createURI(litdupsNS, "friendOf"), vf.createURI(litdupsNS, "Jeff"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "James"), vf.createURI(litdupsNS, "friendOf"), vf.createURI(litdupsNS, "Jeff"))); +conn.commit(); +{code} + +Query: +{code} +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX lit: <urn:test:litdups#> +select * where {?s lit:friendOf lit:Bob.} +{code} + +Results: +{code} +[s=urn:test:litdups#Jeff] +{code} + +h4. How it works + +The query planner will recognize that {{friendOf}} is a SymmetricProperty and devise a Union to find the specified relationship and inverse. 
+ +Query Plan: +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "s" + InferUnion + StatementPattern + Var (name=s) + Var (name=-const-1, value=urn:test:litdups#friendOf, anonymous) + Var (name=-const-2, value=urn:test:litdups#Bob, anonymous) + StatementPattern + Var (name=-const-2, value=urn:test:litdups#Bob, anonymous) + Var (name=-const-1, value=urn:test:litdups#friendOf, anonymous) + Var (name=s) +{code} + +h3. TransitiveProperty + +TransitiveProperty provides a transitive relationship between resources. For example, if Queens is subRegionOf NYC and NYC is subRegionOf NY, then Queens is transitively a subRegionOf NY. + +Code: +{code} +conn.add(new StatementImpl(vf.createURI(litdupsNS, "subRegionOf"), RDF.TYPE, OWL.TRANSITIVEPROPERTY)); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "Queens"), vf.createURI(litdupsNS, "subRegionOf"), vf.createURI(litdupsNS, "NYC"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "NYC"), vf.createURI(litdupsNS, "subRegionOf"), vf.createURI(litdupsNS, "NY"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "NY"), vf.createURI(litdupsNS, "subRegionOf"), vf.createURI(litdupsNS, "US"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "US"), vf.createURI(litdupsNS, "subRegionOf"), vf.createURI(litdupsNS, "NorthAmerica"))); +conn.add(new StatementImpl(vf.createURI(litdupsNS, "NorthAmerica"), vf.createURI(litdupsNS, "subRegionOf"), vf.createURI(litdupsNS, "World"))); +conn.commit(); +{code} + +Query: +{code} +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX lit: <urn:test:litdups#> +select * where {?s lit:subRegionOf lit:NorthAmerica.} +{code} + +Results: +{code} +[s=urn:test:litdups#Queens] +[s=urn:test:litdups#NYC] +[s=urn:test:litdups#NY] +[s=urn:test:litdups#US] +{code} + +h4. How it works + +The TransitiveProperty relationship works by running recursive queries till all the results are returned. 
+ +It is important to note that certain TransitiveProperty relationships will not work: +* Open ended property: ?s subRegionOf ?o (At least one of the properties must be filled or will be filled as the query gets answered) +* Closed property: Queens subRegionOf NY (At least one of the properties must be empty) + +We are working on fixing these issues. + +Query Plan: +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "s" + TransitivePropertySP + Var (name=s) + Var (name=-const-1, value=urn:test:litdups#subRegionOf, anonymous) + Var (name=-const-2, value=urn:test:litdups#NorthAmerica, anonymous) +{code} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-namedgraph.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-namedgraph.conf b/extras/rya.manual/src/main/webapp/sm-namedgraph.conf new file mode 100644 index 0000000..ce2a0da --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-namedgraph.conf @@ -0,0 +1,129 @@ +h1. Named Graphs + +Named graphs are supported simply in the Rdf Store in a few ways. OpenRdf supports sending {{contexts}} as each triple is saved. + +h2. Simple Named Graph Load and Query + +Here is a very simple example of using the API to Insert data in named graphs and querying with Sparql + +First we will define a Trig document to load +Trig document +{code} +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . +@prefix swp: <http://www.w3.org/2004/03/trix/swp-1/> . +@prefix dc: <http://purl.org/dc/elements/1.1/> . +@prefix ex: <http://www.example.org/vocabulary#> . +@prefix : <http://www.example.org/exampleDocument#> . +:G1 { :Monica ex:name "Monica Murphy" . + :Monica ex:homepage <http://www.monicamurphy.org> . + :Monica ex:email <mailto:[email protected]> . 
+ :Monica ex:hasSkill ex:Management } + +:G2 { :Monica rdf:type ex:Person . + :Monica ex:hasSkill ex:Programming } + +:G4 { :Phobe ex:name "Phobe Buffet" } + +:G3 { :G1 swp:assertedBy _:w1 . + _:w1 swp:authority :Chris . + _:w1 dc:date "2003-10-02"^^xsd:date . + :G2 swp:quotedBy _:w2 . + :G4 swp:assertedBy _:w2 . + _:w2 dc:date "2003-09-03"^^xsd:date . + _:w2 swp:authority :Tom . + :Chris rdf:type ex:Person . + :Chris ex:email <mailto:[email protected]>. + :Tom rdf:type ex:Person . + :Tom ex:email <mailto:[email protected]>} +{code} + +We will assume that this file is saved on your classpath somewhere at {{<TRIG_FILE>}} + +Load data through API: +{code} +InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream("namedgraphs.trig"); +RepositoryConnection conn = repository.getConnection(); +conn.add(stream, "", RDFFormat.TRIG); +conn.commit(); +{code} + +Now that the data is loaded we can easily query it. For example, we will query to find what {{hasSkill}} is defined in graph G2, and relate that to someone defined in G1. + +Query: +{code} +PREFIX ex: <http://www.example.org/exampleDocument#> +PREFIX voc: <http://www.example.org/vocabulary#> +PREFIX foaf: <http://xmlns.com/foaf/0.1/> +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> + +SELECT * +WHERE +{ + GRAPH ex:G1 + { + ?m voc:name ?name ; + voc:homepage ?hp . + } . + GRAPH ex:G2 + { + ?m voc:hasSkill ?skill . + } . 
+} +{code} + +Results: +{code} +[hp=http://www.monicamurphy.org;m=http://www.example.org/exampleDocument#Monica;skill=http://www.example.org/vocabulary#Programming;name="Monica Murphy"] +{code} + +Here is the Query Plan as well: +{code} +QueryRoot + Projection + ProjectionElemList + ProjectionElem "m" + ProjectionElem "name" + ProjectionElem "hp" + ProjectionElem "skill" + Join + Join + StatementPattern FROM NAMED CONTEXT + Var (name=m) + Var (name=-const-2, value=http://www.example.org/vocabulary#name, anonymous) + Var (name=name) + Var (name=-const-1, value=http://www.example.org/exampleDocument#G1, anonymous) + StatementPattern FROM NAMED CONTEXT + Var (name=m) + Var (name=-const-3, value=http://www.example.org/vocabulary#homepage, anonymous) + Var (name=hp) + Var (name=-const-1, value=http://www.example.org/exampleDocument#G1, anonymous) + StatementPattern FROM NAMED CONTEXT + Var (name=m) + Var (name=-const-5, value=http://www.example.org/vocabulary#hasSkill, anonymous) + Var (name=skill) + Var (name=-const-4, value=http://www.example.org/exampleDocument#G2, anonymous) +{code} + +h2. Inserting named graph data through Sparql + +The new Sparql update standard provides another way to insert data, even into named graphs. + +First the insert update: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +PREFIX ex: <http://example/addresses#> +INSERT DATA +{ + GRAPH ex:G1 { + <http://example/book3> dc:title "A new book" ; + dc:creator "A.N.Other" . 
+ } +} +{code} + +To perform this update, it requires different code than querying the data directly: +{code} +Update update = conn.prepareUpdate(QueryLanguage.SPARQL, insert); +update.execute(); +{code} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-simpleaqr.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-simpleaqr.conf b/extras/rya.manual/src/main/webapp/sm-simpleaqr.conf new file mode 100644 index 0000000..7616197 --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-simpleaqr.conf @@ -0,0 +1,54 @@ +h1. Simple Add Query and Remove of Statements + +This quick tutorial will give a small example on how to add, query, and remove statements from Cloudbase + +h2. Code + +{code} +//setup +Connector connector = new ZooKeeperInstance("cbinstance", "zkserver:port").getConnector("cbuser", "cbpassword"); +final RdfCloudTripleStore store = new RdfCloudTripleStore(); +AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); +crdfdao.setConnector(connector); + +AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); +conf.setTablePrefix("rts_"); +conf.setDisplayQueryPlan(true); +crdfdao.setConf(conf); +store.setRdfDao(crdfdao); + +ProspectorServiceEvalStatsDAO evalDao = new ProspectorServiceEvalStatsDAO(connector, conf); +evalDao.init(); +store.setRdfEvalStatsDAO(evalDao); + +InferenceEngine inferenceEngine = new InferenceEngine(); +inferenceEngine.setRdfDao(crdfdao); +inferenceEngine.setConf(conf); +store.setInferenceEngine(inferenceEngine); + +Repository myRepository = new RyaSailRepository(store); +myRepository.initialize(); +RepositoryConnection conn = myRepository.getConnection(); + +//define and add statement +String litdupsNS = "urn:test:litdups#"; +URI cpu = vf.createURI(litdupsNS, "cpu"); +URI loadPerc = vf.createURI(litdupsNS, "loadPerc"); +URI uri1 = vf.createURI(litdupsNS, "uri1"); +conn.add(cpu, 
loadPerc, uri1); +conn.commit(); + +//query for all statements that have subject=cpu and pred=loadPerc (wildcard object) +RepositoryResult<Statement> result = conn.getStatements(cpu, loadPerc, null, true); +while(result.hasNext()) { + System.out.println(result.next()); +} +result.close(); + +//remove statement +conn.remove(cpu, loadPerc, uri1); + +//close +conn.close(); +myRepository.shutDown(); +{code} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-sparqlquery.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-sparqlquery.conf b/extras/rya.manual/src/main/webapp/sm-sparqlquery.conf new file mode 100644 index 0000000..985e21a --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-sparqlquery.conf @@ -0,0 +1,58 @@ +h1. Simple Sparql Query + +This quick tutorial will give a small example on how to query data with Sparql + +h2. 
Code + +{code} +//setup +Connector connector = new ZooKeeperInstance("cbinstance", "zkserver:port").getConnector("cbuser", "cbpassword"); +final RdfCloudTripleStore store = new RdfCloudTripleStore(); +AccumuloRyaDAO crdfdao = new AccumuloRyaDAO(); +crdfdao.setConnector(connector); + +AccumuloRdfConfiguration conf = new AccumuloRdfConfiguration(); +conf.setTablePrefix("rts_"); +conf.setDisplayQueryPlan(true); +crdfdao.setConf(conf); +store.setRdfDao(crdfdao); + +ProspectorServiceEvalStatsDAO evalDao = new ProspectorServiceEvalStatsDAO(connector, conf); +evalDao.init(); +store.setRdfEvalStatsDAO(evalDao); + +InferenceEngine inferenceEngine = new InferenceEngine(); +inferenceEngine.setRdfDao(crdfdao); +inferenceEngine.setConf(conf); +store.setInferenceEngine(inferenceEngine); + +Repository myRepository = new RyaSailRepository(store); +myRepository.initialize(); +RepositoryConnection conn = myRepository.getConnection(); + +//define and add statements +String litdupsNS = "urn:test:litdups#"; +URI cpu = vf.createURI(litdupsNS, "cpu"); +URI loadPerc = vf.createURI(litdupsNS, "loadPerc"); +URI uri1 = vf.createURI(litdupsNS, "uri1"); +URI pred2 = vf.createURI(litdupsNS, "pred2"); +URI uri2 = vf.createURI(litdupsNS, "uri2"); +conn.add(cpu, loadPerc, uri1); +conn.commit(); + +//query using sparql +String query = "select * where {" + + "?x <" + loadPerc.stringValue() + "> ?o1." + + "?x <" + pred2.stringValue() + "> ?o2." 
+ + "}"; +TupleQuery tupleQuery = conn.prepareTupleQuery(QueryLanguage.SPARQL, query); +TupleQueryResult result = tupleQuery.evaluate(); +while(result.hasNext()) { + System.out.println(result.next()); +} +result.close(); + +//close +conn.close(); +myRepository.shutDown(); +{code} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.manual/src/main/webapp/sm-updatedata.conf ---------------------------------------------------------------------- diff --git a/extras/rya.manual/src/main/webapp/sm-updatedata.conf b/extras/rya.manual/src/main/webapp/sm-updatedata.conf new file mode 100644 index 0000000..b3e3a5b --- /dev/null +++ b/extras/rya.manual/src/main/webapp/sm-updatedata.conf @@ -0,0 +1,56 @@ +h1. Sparql Update + +OpenRDF supports the Sparql Update functionality. Here are a few samples: + +Remember, you have to use {{RepositoryConnection.prepareUpdate(..)}} to perform these queries + +Insert: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +INSERT DATA +{ <http://example/book3> dc:title "A new book" ; + dc:creator "A.N.Other" . +} +{code} + +Delete: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +DELETE DATA +{ <http://example/book3> dc:title "A new book" ; + dc:creator "A.N.Other" . +} +{code} + +Update: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +DELETE { ?book dc:title ?title } +INSERT { ?book dc:title "A newer book". ?book dc:add "Additional Info" } +WHERE + { ?book dc:creator "A.N.Other" . + } +{code} + +Insert Named Graph: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +PREFIX ex: <http://example/addresses#> +INSERT DATA +{ GRAPH ex:G1 { +<http://example/book3> dc:title "A new book" ; + dc:creator "A.N.Other" . +} +} +{code} + +Update Named Graph: +{code} +PREFIX dc: <http://purl.org/dc/elements/1.1/> +WITH <http://example/addresses#G1> +DELETE { ?book dc:title ?title } +INSERT { ?book dc:title "A newer book". 
?book dc:add "Additional Info" } +WHERE + { ?book dc:creator "A.N.Other" . + } +{code} http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/pom.xml ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/pom.xml b/extras/rya.prospector/pom.xml new file mode 100644 index 0000000..ddb28dd --- /dev/null +++ b/extras/rya.prospector/pom.xml @@ -0,0 +1,131 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>mvm.rya</groupId> + <artifactId>rya.extras</artifactId> + <version>3.2.9</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <name>${project.groupId}.${project.artifactId}</name> + <artifactId>rya.prospector</artifactId> + <dependencies> + <dependency> + <groupId>mvm.rya</groupId> + <artifactId>rya.api</artifactId> + <exclusions> + <exclusion> + <artifactId>mockito-all</artifactId> + <groupId>org.mockito</groupId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </dependency> + <dependency> + <groupId>mvm.rya</groupId> + <artifactId>accumulo.rya</artifactId> + </dependency> + <dependency> + <groupId>org.codehaus.groovy</groupId> + <artifactId>groovy-all</artifactId> + </dependency> + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <version>1.1.0</version> + <classifier>hadoop2</classifier> + <scope>test</scope> + </dependency> + + </dependencies> + <build> + <plugins> +<plugin> +<artifactId>maven-compiler-plugin</artifactId> +<!-- 2.8.0-01 and later require maven-compiler-plugin 3.1 or higher --> +<version>3.1</version> 
+<configuration> +<compilerId>groovy-eclipse-compiler</compilerId> +</configuration> +<dependencies> +<dependency> +<groupId>org.codehaus.groovy</groupId> +<artifactId>groovy-eclipse-compiler</artifactId> +<version>2.9.1-01</version> +</dependency> +<!-- for 2.8.0-01 and later you must have an explicit dependency on groovy-eclipse-batch --> +<dependency> +<groupId>org.codehaus.groovy</groupId> +<artifactId>groovy-eclipse-batch</artifactId> +<version>2.3.7-01</version> +</dependency> +</dependencies> +</plugin> +<plugin> +<groupId>org.codehaus.groovy</groupId> +<artifactId>groovy-eclipse-compiler</artifactId> +<version>2.9.1-01</version> +<extensions>true</extensions> +</plugin> +<plugin> +<groupId>org.apache.maven.plugins</groupId> +<artifactId>maven-shade-plugin</artifactId> +<executions> +<execution> +<configuration> +<shadedArtifactAttached>true</shadedArtifactAttached> +<shadedClassifierName>map-reduce</shadedClassifierName> +<transformers> +<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> +</transformers> +</configuration> +</execution> +</executions> +</plugin> +</plugins> + </build> + <profiles> + <profile> + <id>accumulo</id> + <activation> + <activeByDefault>true</activeByDefault> + </activation> + <dependencies> + <dependency> + <groupId>org.apache.accumulo</groupId> + <artifactId>accumulo-core</artifactId> + <optional>true</optional> + </dependency> + <dependency> + <groupId>mvm.rya</groupId> + <artifactId>accumulo.iterators</artifactId> + <optional>true</optional> + </dependency> + </dependencies> + </profile> + <profile> + <id>cloudbase</id> + <activation> + <activeByDefault>false</activeByDefault> + </activation> + <dependencies> + <dependency> + <groupId>com.texeltek</groupId> + <artifactId>accumulo-cloudbase-shim</artifactId> + <optional>true</optional> + </dependency> + <dependency> + <groupId>mvm.rya</groupId> + <artifactId>cloudbase.iterators</artifactId> + <optional>true</optional> + 
</dependency> + </dependencies> + </profile> + </profiles> +</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/domain/IndexEntry.groovy ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/domain/IndexEntry.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/domain/IndexEntry.groovy new file mode 100644 index 0000000..ae08089 --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/domain/IndexEntry.groovy @@ -0,0 +1,57 @@ +package mvm.rya.prospector.domain + +/** + * Date: 12/5/12 + * Time: 11:33 AM + */ +class IndexEntry { + def String index + def String data + def String dataType + def String tripleValueType + def String visibility + def Long count + def Long timestamp + + @Override + public String toString() { + return "IndexEntry{" + + "index='" + index + '\'' + + ", data='" + data + '\'' + + ", dataType='" + dataType + '\'' + + ", tripleValueType=" + tripleValueType + + ", visibility='" + visibility + '\'' + + ", timestamp='" + timestamp + '\'' + + ", count=" + count + + '}'; + } + + boolean equals(o) { + if (this.is(o)) return true + if (getClass() != o.class) return false + + IndexEntry that = (IndexEntry) o + + if (count != that.count) return false + if (timestamp != that.timestamp) return false + if (data != that.data) return false + if (dataType != that.dataType) return false + if (index != that.index) return false + if (tripleValueType != that.tripleValueType) return false + if (visibility != that.visibility) return false + + return true + } + + int hashCode() { + int result + result = (index != null ? index.hashCode() : 0) + result = 31 * result + (data != null ? data.hashCode() : 0) + result = 31 * result + (dataType != null ? dataType.hashCode() : 0) + result = 31 * result + (tripleValueType != null ? 
/**
 * Intermediate key of the prospector job: identifies one counted value by
 * index name, data, data type, triple position and visibility.
 *
 * Instances are serialized with {@link #write} / {@link #readFields} and
 * ordered with {@link #compareTo}. {@code equals} and {@code hashCode} are
 * defined over the same five fields so that two keys with the same content
 * are equal and hash identically in every JVM; hash-based partitioning of
 * keys depends on that.
 *
 * Date: 12/3/12
 * Time: 11:15 AM
 */
class IntermediateProspect implements WritableComparable<IntermediateProspect> {

    def String index
    def String data
    def String dataType
    def TripleValueType tripleValueType
    def String visibility

    /**
     * Orders by index, data, dataType, tripleValueType and visibility, in
     * that order; the first differing field decides.
     * All fields are expected to be non-null.
     */
    @Override
    int compareTo(IntermediateProspect t) {
        if(!index.equals(t.index))
            return index.compareTo(t.index);
        if(!data.equals(t.data))
            return data.compareTo(t.data);
        if(!dataType.equals(t.dataType))
            return dataType.compareTo(t.dataType);
        if(!tripleValueType.equals(t.tripleValueType))
            return tripleValueType.compareTo(t.tripleValueType);
        if(!visibility.equals(t.visibility))
            return visibility.compareTo(t.visibility);
        return 0
    }

    /**
     * Field-wise equality over the same fields used by {@link #compareTo}.
     */
    @Override
    boolean equals(Object o) {
        if (this.is(o)) return true
        if (o == null || getClass() != o.getClass()) return false

        IntermediateProspect that = (IntermediateProspect) o

        if (index != that.index) return false
        if (data != that.data) return false
        if (dataType != that.dataType) return false
        if (tripleValueType != that.tripleValueType) return false
        if (visibility != that.visibility) return false

        return true
    }

    /**
     * Content-based hash, consistent with {@link #equals}.
     */
    @Override
    int hashCode() {
        int result
        result = (index != null ? index.hashCode() : 0)
        result = 31 * result + (data != null ? data.hashCode() : 0)
        result = 31 * result + (dataType != null ? dataType.hashCode() : 0)
        // Enum.hashCode() is identity based and differs between JVMs; hash the constant's name instead
        result = 31 * result + (tripleValueType != null ? tripleValueType.name().hashCode() : 0)
        result = 31 * result + (visibility != null ? visibility.hashCode() : 0)
        return result
    }

    /**
     * Writes the five fields as UTF strings; the enum is written by name.
     */
    @Override
    void write(DataOutput dataOutput) {
        dataOutput.writeUTF(index);
        dataOutput.writeUTF(data);
        dataOutput.writeUTF(dataType);
        dataOutput.writeUTF(tripleValueType.name());
        dataOutput.writeUTF(visibility);
    }

    /**
     * Reads the fields back in the order {@link #write} emitted them.
     */
    @Override
    void readFields(DataInput dataInput) {
        index = dataInput.readUTF()
        data = dataInput.readUTF()
        dataType = dataInput.readUTF()
        tripleValueType = TripleValueType.valueOf(dataInput.readUTF())
        visibility = dataInput.readUTF()
    }
}
+import org.apache.hadoop.io.LongWritable +import org.apache.commons.lang.time.DateUtils + +import mvm.rya.prospector.domain.IntermediateProspect + +import com.google.common.collect.Lists + +import static mvm.rya.prospector.utils.ProspectorConstants.* +import static mvm.rya.prospector.utils.ProspectorUtils.* + +/** + * Date: 12/3/12 + * Time: 10:57 AM + */ +class Prospector extends Configured implements Tool { + + private static long NOW = System.currentTimeMillis(); + + private Date truncatedDate; + + public static void main(String[] args) { + int res = ToolRunner.run(new Prospector(), args); + System.exit(res); + } + + @Override + int run(String[] args) { + Configuration conf = getConf(); + + truncatedDate = DateUtils.truncate(new Date(NOW), Calendar.MINUTE); + + Path configurationPath = new Path(args[0]); + conf.addResource(configurationPath); + + def inTable = conf.get("prospector.intable") + def outTable = conf.get("prospector.outtable") + def auths_str = conf.get("prospector.auths") + assert inTable != null + assert outTable != null + assert auths_str != null + + Job job = new Job(getConf(), this.getClass().getSimpleName() + "_" + System.currentTimeMillis()); + job.setJarByClass(this.getClass()); + + String[] auths = auths_str.split(",") + ProspectorUtils.initMRJob(job, inTable, outTable, auths) + + job.getConfiguration().setLong("DATE", NOW); + + def performant = conf.get(PERFORMANT) + if (Boolean.parseBoolean(performant)) { + /** + * Apply some performance tuning + */ + ProspectorUtils.addMRPerformance(job.configuration) + } + + job.setMapOutputKeyClass(IntermediateProspect.class); + job.setMapOutputValueClass(LongWritable.class); + + job.setMapperClass(ProspectorMapper.class); + job.setCombinerClass(ProspectorCombiner.class); + job.setReducerClass(ProspectorReducer.class); + job.waitForCompletion(true); + + int success = job.isSuccessful() ? 
/**
 * Combiner step of the prospector job: looks up the work plan named by the
 * incoming key's index and re-emits whatever that plan's combine() returns.
 *
 * Date: 12/3/12
 * Time: 11:06 AM
 */
class ProspectorCombiner extends Reducer {

    private Date truncatedDate;
    private IndexWorkPlanManager manager = new ServicesBackedIndexWorkPlanManager()
    Map<String, IndexWorkPlan> plans

    /**
     * Reads the job's "DATE" setting (falling back to the current time),
     * truncates it to the minute, and builds the plan lookup map.
     */
    @Override
    public void setup(Reducer.Context context) throws IOException, InterruptedException {
        super.setup(context);

        def jobDate = new Date(context.getConfiguration().getLong("DATE", System.currentTimeMillis()))
        truncatedDate = DateUtils.truncate(jobDate, Calendar.MINUTE);

        this.plans = ProspectorUtils.planMap(manager.plans)
    }

    /**
     * Delegates to the plan registered for prospect.index; keys without a
     * plan, and plans that return null, produce no output.
     */
    @Override
    protected void reduce(def prospect, Iterable values, Reducer.Context context) {
        def plan = plans.get(prospect.index)
        if (plan == null) {
            return
        }

        def combined = plan.combine(prospect, values)
        if (combined == null) {
            return
        }

        for (entry in combined) {
            context.write(entry.key, entry.value)
        }
    }
}
/**
 * Reduce step of the prospector job: hands each key and its values to the
 * work plan named by the key's index, together with the job timestamp.
 *
 * Date: 12/3/12
 * Time: 11:06 AM
 */
class ProspectorReducer extends Reducer {

    private Date truncatedDate;
    private IndexWorkPlanManager manager = new ServicesBackedIndexWorkPlanManager()
    Map<String, IndexWorkPlan> plans

    /**
     * Reads the job's "DATE" setting (falling back to the current time),
     * truncates it to the minute, and builds the plan lookup map.
     */
    @Override
    public void setup(Reducer.Context context) throws IOException, InterruptedException {
        super.setup(context);

        long jobMillis = context.getConfiguration().getLong("DATE", System.currentTimeMillis());
        truncatedDate = DateUtils.truncate(new Date(jobMillis), Calendar.MINUTE);

        this.plans = ProspectorUtils.planMap(manager.plans)
    }

    /**
     * Delegates to the plan registered for prospect.index; keys without a
     * plan are dropped.
     */
    @Override
    protected void reduce(def prospect, Iterable values, Reducer.Context context) {
        def plan = plans.get(prospect.index)
        if (plan == null) {
            return
        }
        plan.reduce(prospect, values, truncatedDate, context)
    }
}
/**
 * Work plan for the COUNT index: counts how often each subject, predicate,
 * object, pairwise combination and subject prefix occurs.
 *
 * map() emits one entry with count 1 per counted value, combine() and
 * reduce() sum them, and reduce() writes the total as a Mutation whose row is
 * {@code <tripleValueType> DELIM <data> DELIM <reverse-index date>}.
 * query() reads those rows back as IndexEntry objects.
 *
 * Date: 12/3/12
 * Time: 12:28 PM
 */
class CountPlan implements IndexWorkPlan {

    /**
     * Emits seven (key, ONE) entries for a statement: subject, predicate,
     * object, subject+predicate, subject+object, predicate+object, and the
     * "entity" entry, whose data is the subject up to (but excluding) the
     * character preceding its local name.
     */
    @Override
    Collection<Map.Entry<IntermediateProspect, LongWritable>> map(RyaStatement ryaStatement) {
        def subject = ryaStatement.getSubject()
        def predicate = ryaStatement.getPredicate()
        def object = ryaStatement.getObject()
        def subjpred = subject.data + DELIM + predicate.data
        def predobj = predicate.data + DELIM + object.data
        def subjobj = subject.data + DELIM + object.data
        def localIndex = URIUtil.getLocalNameIndex(subject.data)
        def namespace = subject.data.substring(0, localIndex - 1)
        def visibility = new String(ryaStatement.columnVisibility)
        def stringType = XMLSchema.STRING.stringValue()
        return [
                countEntry(subject.data, URITYPE, TripleValueType.subject, visibility),
                countEntry(predicate.data, URITYPE, TripleValueType.predicate, visibility),
                countEntry(object.data, object.dataType.stringValue(), TripleValueType.object, visibility),
                countEntry(subjpred, stringType, TripleValueType.subjectpredicate, visibility),
                countEntry(subjobj, stringType, TripleValueType.subjectobject, visibility),
                countEntry(predobj, stringType, TripleValueType.predicateobject, visibility),
                countEntry(namespace, URITYPE, TripleValueType.entity, visibility),
        ]
    }

    /**
     * Collapses the counts seen so far for a key into a single partial sum.
     */
    @Override
    Collection<Map.Entry<IntermediateProspect, LongWritable>> combine(IntermediateProspect prospect, Iterable<LongWritable> counts) {
        long sum = sumCounts(counts)
        return [new CustomEntry<IntermediateProspect, LongWritable>(prospect, new LongWritable(sum))]
    }

    /**
     * Sums the counts for a key and writes the total as a Mutation stamped
     * with the given timestamp.
     */
    @Override
    void reduce(IntermediateProspect prospect, Iterable<LongWritable> counts, Date timestamp, Reducer.Context context) {
        long sum = sumCounts(counts)

        def indexType = prospect.tripleValueType.name()

        // not sure if this is the best idea..
        // NOTE(review): a sum of non-negative counts always satisfies the first test,
        // so this guard never filters anything as written
        if ((sum >= 0) ||
                indexType.equals(TripleValueType.predicate.toString())) {

            Mutation m = new Mutation(indexType + DELIM + prospect.data + DELIM + ProspectorUtils.getReverseIndexDateTime(timestamp))
            m.put(COUNT, prospect.dataType, new ColumnVisibility(prospect.visibility), timestamp.getTime(), new Value("${sum}".getBytes()));

            context.write(null, m);
        }
    }

    @Override
    String getIndexType() {
        return COUNT
    }

    /**
     * Joins the given values with DELIM; the list must contain at least one
     * element.
     */
    @Override
    String getCompositeValue(List<String> indices){
        Iterator<String> indexIt = indices.iterator();
        String compositeIndex = indexIt.next();
        while (indexIt.hasNext()){
            String value = indexIt.next();
            compositeIndex += DELIM + value;
        }
        return compositeIndex;
    }

    /**
     * Looks up stored counts for one value.
     *
     * @param connector      connector used to create the batch scanner
     * @param tableName      table holding the prospect rows
     * @param prospectTimes  prospect run times to look at; when null, the
     *                       whole row prefix is scanned and at most one entry
     *                       is returned
     * @param type           triple value type name forming the row prefix
     * @param compositeIndex value (or DELIM-joined values) to look up
     * @param dataType       column qualifier to restrict to, or null for any
     * @param auths          scan authorizations
     * @return at most 1000 entries when prospectTimes is given, at most one
     *         otherwise
     */
    @Override
    List<IndexEntry> query(def connector, String tableName, List<Long> prospectTimes, String type, String compositeIndex, String dataType, String[] auths) {

        assert connector != null && tableName != null && type != null && compositeIndex != null

        def bs = connector.createBatchScanner(tableName, new Authorizations(auths), 4)
        try {
            def ranges = []
            int max = 1000; //by default only return 1000 prospects maximum
            if (prospectTimes != null) {
                prospectTimes.each { prospect ->
                    ranges.add(
                            new Range(type + DELIM + compositeIndex + DELIM + ProspectorUtils.getReverseIndexDateTime(new Date(prospect))))
                }
            } else {
                max = 1; //only return the latest if no prospectTimes given
                def prefix = type + DELIM + compositeIndex + DELIM;
                ranges.add(new Range(prefix, prefix + RdfCloudTripleStoreConstants.LAST))
            }
            bs.ranges = ranges
            if (dataType != null) {
                bs.fetchColumn(new Text(COUNT), new Text(dataType))
            } else {
                bs.fetchColumnFamily(new Text(COUNT))
            }

            // composite types keep two DELIM-separated values in the row, the others one
            boolean composite = type.equalsIgnoreCase(TripleValueType.subjectpredicate.toString()) ||
                    type.equalsIgnoreCase(TripleValueType.subjectobject.toString()) ||
                    type.equalsIgnoreCase(TripleValueType.predicateobject.toString())

            List<IndexEntry> indexEntries = new ArrayList<IndexEntry>()
            def iter = bs.iterator()

            // strict bound: stop once max entries have been collected
            while (iter.hasNext() && indexEntries.size() < max) {
                def entry = iter.next()
                def k = entry.key
                def v = entry.value

                def rowArr = k.row.toString().split(DELIM)
                String values = composite ? rowArr[1] + DELIM + rowArr[2] : rowArr[1]

                indexEntries.add(new IndexEntry(data: values,
                        tripleValueType: rowArr[0],
                        index: COUNT,
                        dataType: k.columnQualifier.toString(),
                        visibility: k.columnVisibility.toString(),
                        count: Long.parseLong(new String(v.get())),
                        timestamp: k.timestamp
                ))
            }

            return indexEntries
        } finally {
            // release the scanner even when the scan or row parsing throws
            bs.close()
        }
    }

    /** Builds one (key, ONE) map output entry for the COUNT index. */
    private static CustomEntry<IntermediateProspect, LongWritable> countEntry(def data, def dataType, TripleValueType valueType, String visibility) {
        return new CustomEntry<IntermediateProspect, LongWritable>(
                new IntermediateProspect(index: COUNT,
                        data: data,
                        dataType: dataType,
                        tripleValueType: valueType,
                        visibility: visibility),
                ONE)
    }

    /** Adds up the values of the given counts. */
    private static long sumCounts(Iterable<LongWritable> counts) {
        long sum = 0
        for (LongWritable lw : counts) {
            sum += lw.get()
        }
        return sum
    }

}
---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorService.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorService.groovy new file mode 100644 index 0000000..eb3a975 --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorService.groovy @@ -0,0 +1,107 @@
+package mvm.rya.prospector.service
+
+import mvm.rya.prospector.utils.ProspectorUtils
+import org.apache.accumulo.core.data.Key
+import org.apache.accumulo.core.data.Range
+import org.apache.accumulo.core.security.Authorizations
+import org.apache.hadoop.io.Text
+
+import static mvm.rya.prospector.utils.ProspectorConstants.METADATA
+import static mvm.rya.prospector.utils.ProspectorConstants.PROSPECT_TIME
+import mvm.rya.prospector.plans.IndexWorkPlanManager
+import mvm.rya.prospector.plans.impl.ServicesBackedIndexWorkPlanManager
+import mvm.rya.prospector.plans.IndexWorkPlan
+import mvm.rya.prospector.domain.IndexEntry
+
+/**
+ * Read-side access to a prospect table: lists the timestamps of recorded
+ * prospect runs and forwards index queries to the matching IndexWorkPlan.
+ *
+ * Date: 12/5/12
+ * Time: 12:28 PM
+ */
+class ProspectorService {
+
+    def connector
+    String tableName
+
+    IndexWorkPlanManager manager = new ServicesBackedIndexWorkPlanManager()
+    Map<String, IndexWorkPlan> plans
+
+    /**
+     * Stores the connector and table name, builds the plan lookup map from
+     * the manager's plans, and creates the table when it does not exist yet.
+     *
+     * @param connector object used to reach the table (untyped in this class)
+     * @param tableName name of the prospect table
+     */
+    ProspectorService(def connector, String tableName) {
+        this.connector = connector
+        this.tableName = tableName
+        this.plans = ProspectorUtils.planMap(manager.plans)
+
+        // Nothing more to do when the prospect table is already there.
+        def tableOps = connector.tableOperations()
+        if (tableOps.exists(tableName)) {
+            return
+        }
+        tableOps.create(tableName)
+    }
+
+    /**
+     * Scans the METADATA row, PROSPECT_TIME column family, and exposes the
+     * key timestamp of every entry found.
+     *
+     * @param auths scan authorizations
+     * @return iterator over the entries' key timestamps
+     */
+    public Iterator<Long> getProspects(String[] auths) {
+        def prospectScanner = connector.createScanner(tableName, new Authorizations(auths))
+        prospectScanner.setRange(Range.exact(METADATA))
+        prospectScanner.fetchColumnFamily(new Text(PROSPECT_TIME))
+        return timestampsOf(prospectScanner.iterator())
+    }
+
+    /**
+     * Same as getProspects, but limited to a key range derived from the two
+     * times. The range starts at the reverse-index value of endTime and stops
+     * at that of beginTime.
+     *
+     * @param beginTime lower time bound, passed to new Date(long)
+     * @param endTime   upper time bound, passed to new Date(long)
+     * @param auths     scan authorizations
+     * @return iterator over the entries' key timestamps
+     */
+    public Iterator<Long> getProspectsInRange(long beginTime, long endTime, String[] auths) {
+        def prospectScanner = connector.createScanner(tableName, new Authorizations(auths))
+        def rangeStart = new Key(METADATA, PROSPECT_TIME, ProspectorUtils.getReverseIndexDateTime(new Date(endTime)), "", Long.MAX_VALUE)
+        def rangeStop = new Key(METADATA, PROSPECT_TIME, ProspectorUtils.getReverseIndexDateTime(new Date(beginTime)), "", 0l)
+        prospectScanner.setRange(new Range(rangeStart, rangeStop))
+        return timestampsOf(prospectScanner.iterator())
+    }
+
+    /**
+     * Runs a query through the plan registered under indexType.
+     *
+     * @param prospectTimes handed to the plan unchanged
+     * @param indexType     key of the plan to use; must name a known plan
+     * @param type          handed to the plan unchanged
+     * @param index         values the plan folds into one composite value
+     * @param dataType      handed to the plan unchanged
+     * @param auths         handed to the plan unchanged
+     * @return whatever the plan's query returns
+     */
+    public List<IndexEntry> query(List<Long> prospectTimes, String indexType, String type, List<String> index, String dataType, String[] auths) {
+        assert indexType != null
+
+        def plan = plans.get(indexType)
+        assert plan != null: "Index Type: ${indexType} does not exist"
+
+        String composite = plan.getCompositeValue(index)
+        return plan.query(connector, tableName, prospectTimes, type, composite, dataType, auths)
+    }
+
+    /** Adapts an iterator of scanner entries to an iterator of their key timestamps. */
+    private static Iterator<Long> timestampsOf(def entries) {
+        return new Iterator<Long>() {
+
+            @Override
+            public boolean hasNext() {
+                return entries.hasNext()
+            }
+
+            @Override
+            public Long next() {
+                return entries.next().getKey().getTimestamp()
+            }
+
+            @Override
+            public void remove() {
+                entries.remove()
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorServiceEvalStatsDAO.groovy ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorServiceEvalStatsDAO.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorServiceEvalStatsDAO.groovy new file mode 100644 index 0000000..4e9c3d1 --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/service/ProspectorServiceEvalStatsDAO.groovy @@ -0,0 +1,103 @@ +package mvm.rya.prospector.service + +import
mvm.rya.api.RdfCloudTripleStoreConfiguration
+import mvm.rya.api.persist.RdfEvalStatsDAO
+import mvm.rya.prospector.domain.TripleValueType
+import mvm.rya.prospector.utils.ProspectorConstants
+import org.apache.hadoop.conf.Configuration
+import org.openrdf.model.Resource
+import org.openrdf.model.Value
+
+import mvm.rya.api.persist.RdfEvalStatsDAO.CARDINALITY_OF
+
+/**
+ * An {@link mvm.rya.api.persist.RdfEvalStatsDAO} that uses the Prospector Service underneath to return counts.
+ */
+class ProspectorServiceEvalStatsDAO implements RdfEvalStatsDAO<RdfCloudTripleStoreConfiguration> {
+
+    def ProspectorService prospectorService
+
+    /** Creates a DAO with no service; prospectorService must be set before init(). */
+    ProspectorServiceEvalStatsDAO() {
+    }
+
+    /**
+     * Wraps an existing service.
+     *
+     * @param prospectorService service that answers the count queries
+     * @param conf              not used by this constructor
+     */
+    ProspectorServiceEvalStatsDAO(ProspectorService prospectorService, RdfCloudTripleStoreConfiguration conf) {
+        this.prospectorService = prospectorService
+    }
+
+    /**
+     * Builds a new ProspectorService on the table named by getProspectTableName(conf).
+     *
+     * @param connector handed to the ProspectorService constructor
+     * @param conf      supplies the table prefix
+     */
+    public ProspectorServiceEvalStatsDAO(def connector, RdfCloudTripleStoreConfiguration conf) {
+        this.prospectorService = new ProspectorService(connector, getProspectTableName(conf))
+    }
+
+    /** Only checks that a service is present; fails the assertion otherwise. */
+    @Override
+    void init() {
+        assert prospectorService != null
+    }
+
+    /** @return true once a service is present */
+    @Override
+    boolean isInitialized() {
+        return prospectorService != null
+    }
+
+    /** No-op. */
+    @Override
+    void destroy() {
+
+    }
+
+    /**
+     * Looks up the count recorded for the given values.
+     *
+     * @param conf supplies the scan authorizations
+     * @param card which part(s) of the triple the values stand for
+     * @param val  values to look up; each is reduced to its stringValue()
+     * @return the count of the first index entry returned, or -1 when the
+     *         query returns no entries
+     */
+    @Override
+    public double getCardinality(RdfCloudTripleStoreConfiguration conf, CARDINALITY_OF card, List<Value> val) {
+
+        assert conf != null && card != null && val != null
+        // Stays null when card matches none of the cases below.
+        String triplePart = null;
+        switch (card) {
+            case (CARDINALITY_OF.SUBJECT):
+                triplePart = TripleValueType.subject
+                break;
+            case (CARDINALITY_OF.PREDICATE):
+                triplePart = TripleValueType.predicate
+                break;
+            case (CARDINALITY_OF.OBJECT):
+                triplePart = TripleValueType.object
+                break;
+            case (CARDINALITY_OF.SUBJECTPREDICATE):
+                triplePart = TripleValueType.subjectpredicate
+                break;
+            case (CARDINALITY_OF.SUBJECTOBJECT):
+                triplePart = TripleValueType.subjectobject
+                break;
+            case (CARDINALITY_OF.PREDICATEOBJECT):
+                triplePart = TripleValueType.predicateobject
+                break;
+        }
+
+        String[] auths = conf.getAuths()
+        List<String> indexedValues = val.collect { it.stringValue() }
+
+        def indexEntries = prospectorService.query(null, ProspectorConstants.COUNT, triplePart, indexedValues, null /** what is the datatype here? */,
+                auths)
+
+        return indexEntries.size() > 0 ? indexEntries.head().count : -1
+    }
+
+    /** Delegates to the three-argument overload; context is ignored. */
+    @Override
+    double getCardinality(RdfCloudTripleStoreConfiguration conf, CARDINALITY_OF card, List<Value> val, Resource context) {
+        return getCardinality(conf, card, val) //TODO: Not sure about the context yet
+    }
+
+    // NOTE(review): the configuration is neither stored here nor returned by
+    // getConf(); confirm that no caller expects a set/get round trip.
+    @Override
+    public void setConf(RdfCloudTripleStoreConfiguration conf) {
+
+    }
+
+    /** @return always null */
+    @Override
+    RdfCloudTripleStoreConfiguration getConf() {
+        return null
+    }
+
+    /**
+     * @param conf supplies the table prefix
+     * @return the table prefix followed by "prospects"
+     */
+    public static String getProspectTableName(RdfCloudTripleStoreConfiguration conf) {
+        return conf.getTablePrefix() + "prospects";
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/CustomEntry.groovy ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/CustomEntry.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/CustomEntry.groovy new file mode 100644 index 0000000..4d7ae1d --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/CustomEntry.groovy @@ -0,0 +1,33 @@
+package mvm.rya.prospector.utils
+
+/**
+ * Mutable key/value pair implementing Map.Entry.
+ *
+ * Date: 12/3/12
+ * Time: 12:33 PM
+ */
+class CustomEntry<K, V> implements Map.Entry<K, V> {
+
+    K key;
+    V value;
+
+    CustomEntry(K key, V value) {
+        this.key = key
+        this.value = value
+    }
+
+    K getKey() {
+        return key
+    }
+
+    void setKey(K key) {
+        this.key = key
+    }
+
+    V getValue() {
+        return value
+    }
+
+    /**
+     * Replaces the value.
+     *
+     * @return the value held before the call, as Map.Entry#setValue requires
+     */
+    V setValue(V value) {
+        V previous = this.value
+        this.value = value
+        return previous
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorConstants.groovy ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorConstants.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorConstants.groovy new file mode 100644 index 0000000..edca753 --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorConstants.groovy @@ -0,0 +1,22 @@
+package mvm.rya.prospector.utils
+
+/**
+ * String and byte constants shared by the prospector classes.
+ *
+ * Date: 12/5/12
+ * Time: 10:57 AM
+ */
+class ProspectorConstants {
+    // Index type name; ProspectorServiceEvalStatsDAO passes it as the index type of its queries.
+    public static final String COUNT = "count"
+    // Row and column family that ProspectorService scans to list prospect times.
+    public static final String METADATA = "metadata"
+    public static final String PROSPECT_TIME = "prospectTime"
+    // NOTE(review): presumably the default column visibility for written entries; no use is visible here.
+    public static final String DEFAULT_VIS = "U&FOUO"
+    // Zero-length byte array.
+    public static final byte[] EMPTY = new byte [0];
+
+    // Configuration property names.
+    // NOTE(review): PERFORMANT is not read anywhere in this part of the patch; confirm its consumer.
+    public static final String PERFORMANT = "performant"
+
+    // Hadoop Configuration keys read by ProspectorUtils to reach Accumulo.
+    public static final String USERNAME = "username"
+    public static final String PASSWORD = "password"
+    public static final String INSTANCE = "instance"
+    public static final String ZOOKEEPERS = "zookeepers"
+    // Parsed with Boolean.parseBoolean; "true" selects a mock instance.
+    public static final String MOCK = "mock"
+}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/92ddfa59/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorUtils.groovy ---------------------------------------------------------------------- diff --git a/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorUtils.groovy b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorUtils.groovy new file mode 100644 index 0000000..ba90fa2 --- /dev/null +++ b/extras/rya.prospector/src/main/groovy/mvm/rya/prospector/utils/ProspectorUtils.groovy @@ -0,0 +1,119 @@ +package mvm.rya.prospector.utils + +import org.apache.accumulo.core.client.Connector +import
org.apache.accumulo.core.client.Instance
+import org.apache.accumulo.core.client.ZooKeeperInstance
+import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat
+import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat
+import org.apache.accumulo.core.client.mock.MockInstance
+import org.apache.accumulo.core.data.Mutation
+import org.apache.accumulo.core.security.Authorizations
+import org.apache.commons.lang.Validate
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.hadoop.mapreduce.Job
+
+import java.text.SimpleDateFormat
+import mvm.rya.prospector.plans.IndexWorkPlan
+import org.apache.accumulo.core.client.security.tokens.PasswordToken
+
+import static mvm.rya.prospector.utils.ProspectorConstants.*
+
+/**
+ * Static helpers for the prospector: reverse date keys, plan lookup maps,
+ * MapReduce job wiring and Accumulo connections.
+ *
+ * Date: 12/4/12
+ * Time: 4:24 PM
+ */
+class ProspectorUtils {
+
+    public static final long INDEXED_DATE_SORT_VAL = 999999999999999999L; // 18 char long, same length as date format pattern below
+    // The triple 's' makes SimpleDateFormat zero-pad seconds to three digits,
+    // which is what brings the formatted value to 18 digits.
+    // NOTE(review): changing it to "ss" would presumably change every key
+    // derived from it; confirm before touching.
+    public static final String INDEXED_DATE_FORMAT = "yyyyMMddHHmmsssSSS";
+
+    /**
+     * Formats the date with INDEXED_DATE_FORMAT, reads the digits as a long
+     * and subtracts it from INDEXED_DATE_SORT_VAL, so later dates give
+     * smaller numbers.
+     *
+     * @param date date to convert; must not be null
+     * @return the difference as a decimal string
+     */
+    public static String getReverseIndexDateTime(Date date) {
+        Validate.notNull(date);
+        String formattedDateString = new SimpleDateFormat(INDEXED_DATE_FORMAT).format(date);
+        long diff = INDEXED_DATE_SORT_VAL - Long.valueOf(formattedDateString);
+
+        return Long.toString(diff);
+    }
+
+    /**
+     * Indexes the given plans by their indexType.
+     *
+     * @param plans collection of plans
+     * @return map from indexType to plan; a later plan replaces an earlier
+     *         one with the same indexType
+     */
+    public static Map<String, IndexWorkPlan> planMap(def plans) {
+        plans.inject([:]) { map, plan ->
+            map.putAt(plan.indexType, plan)
+            map
+        }
+    }
+
+    /**
+     * Configures the job to read from one Accumulo table and write Mutations
+     * to another, using the connection settings stored in the job configuration.
+     *
+     * @param job      job to configure
+     * @param table    input table name
+     * @param outtable default output table name
+     * @param auths    scan authorizations for the input
+     * @throws IllegalArgumentException when the configuration selects neither
+     *         a mock instance nor zookeepers
+     */
+    public static void initMRJob(Job job, String table, String outtable, String[] auths) {
+        Configuration conf = job.configuration
+        String username = conf.get(USERNAME)
+        String password = conf.get(PASSWORD)
+        String instance = conf.get(INSTANCE)
+        String zookeepers = conf.get(ZOOKEEPERS)
+        String mock = conf.get(MOCK)
+
+        //input
+        if (Boolean.parseBoolean(mock)) {
+            AccumuloInputFormat.setMockInstance(job, instance)
+            AccumuloOutputFormat.setMockInstance(job, instance)
+        } else if (zookeepers != null) {
+            AccumuloInputFormat.setZooKeeperInstance(job, instance, zookeepers)
+            AccumuloOutputFormat.setZooKeeperInstance(job, instance, zookeepers)
+        } else {
+            throw new IllegalArgumentException("Must specify either mock or zookeepers");
+        }
+
+        AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password.getBytes()))
+        AccumuloInputFormat.setInputTableName(job, table)
+        job.setInputFormatClass(AccumuloInputFormat.class);
+        AccumuloInputFormat.setScanAuthorizations(job, new Authorizations(auths))
+
+        // OUTPUT
+        job.setOutputFormatClass(AccumuloOutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(Mutation.class);
+        AccumuloOutputFormat.setConnectorInfo(job, username, new PasswordToken(password.getBytes()))
+        AccumuloOutputFormat.setDefaultTableName(job, outtable)
+    }
+
+    /**
+     * Turns off speculative execution for map and reduce tasks, sets
+     * io.sort.mb to 256 and enables gzip compression of map output.
+     *
+     * @param conf configuration to modify in place
+     */
+    public static void addMRPerformance(Configuration conf) {
+        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
+        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
+        conf.set("io.sort.mb", "256");
+        conf.setBoolean("mapred.compress.map.output", true);
+        conf.set("mapred.map.output.compression.codec", GzipCodec.class.getName());
+    }
+
+    /**
+     * Creates the Accumulo instance described by the configuration: a
+     * MockInstance when MOCK parses to true, otherwise a ZooKeeperInstance
+     * when ZOOKEEPERS is set.
+     *
+     * @param conf configuration holding the INSTANCE, ZOOKEEPERS and MOCK values
+     * @return the new instance
+     * @throws IllegalArgumentException when neither mock nor zookeepers is configured
+     */
+    public static Instance instance(Configuration conf) {
+        assert conf != null
+
+        String instance_str = conf.get(INSTANCE)
+        String zookeepers = conf.get(ZOOKEEPERS)
+        String mock = conf.get(MOCK)
+        if (Boolean.parseBoolean(mock)) {
+            return new MockInstance(instance_str)
+        } else if (zookeepers != null) {
+            return new ZooKeeperInstance(instance_str, zookeepers)
+        } else {
+            throw new IllegalArgumentException("Must specify either mock or zookeepers");
+        }
+    }
+
+    /**
+     * Opens a connector with the USERNAME and PASSWORD found in the configuration.
+     *
+     * @param instance instance to connect to; when null, one is built from conf
+     * @param conf     configuration holding the credentials
+     * @return the connector returned by the instance
+     */
+    public static Connector connector(Instance instance, Configuration conf) {
+        String username = conf.get(USERNAME)
+        String password = conf.get(PASSWORD)
+        if (instance == null) {
+            // The call must be class-qualified. A bare instance(conf) is
+            // resolved against the parameter named "instance" and becomes
+            // instance.call(conf), which fails on the null this branch handles.
+            instance = ProspectorUtils.instance(conf)
+        }
+        return instance.getConnector(username, password)
+    }
+
+    /**
+     * Writes every mutation to the table through one batch writer.
+     *
+     * @param connector connector used to create the batch writer
+     * @param tableName table to write to
+     * @param mutations mutations to add, in iteration order
+     */
+    public static void writeMutations(Connector connector, String tableName, def mutations) {
+        def bw = connector.createBatchWriter(tableName, 10000l, 10000l, 4);
+        try {
+            mutations.each { m ->
+                bw.addMutation(m)
+            }
+            bw.flush()
+        } finally {
+            // Close even when adding or flushing fails, so the writer is not leaked.
+            bw.close()
+        }
+    }
+
+}
