jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/362159 )
Change subject: Rename project_class to project_family
......................................................................
Rename project_class to project_family
In host normalization, we used the project_class name.
This project_class name is now changed to project_family
(rethought when working on unique devices).
This patch updates the webrequest refinement functions to
now provide both project_class AND project_family, with the plan
to remove project_class at some future time.
Bug: T168874
Change-Id: If2a3a285c6a4194be9287dfca2b9dbaada99a916
---
M
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/NormalizedHostInfo.java
M
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
M refinery-core/src/test/resources/normalize_host_test_data.csv
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetHostPropertiesUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/HostNormalizerUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetHostPropertiesUDF.java
7 files changed, 36 insertions(+), 24 deletions(-)
Approvals:
Joal: Looks good to me, approved
jenkins-bot: Verified
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/NormalizedHostInfo.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/NormalizedHostInfo.java
index 06f5833..774dd91 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/NormalizedHostInfo.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/NormalizedHostInfo.java
@@ -13,24 +13,24 @@
*/
public static final String EMPTY_NORM_HOST_VALUE = "-";
- private String projectClass;
+ private String projectFamily;
private String project;
private List<String> qualifiers;
private String tld;
public NormalizedHostInfo() {
- projectClass = EMPTY_NORM_HOST_VALUE;
+ projectFamily = EMPTY_NORM_HOST_VALUE;
project = EMPTY_NORM_HOST_VALUE;
qualifiers = new ArrayList<>();
tld = EMPTY_NORM_HOST_VALUE;
}
- public String getProjectClass() {
- return projectClass;
+ public String getProjectFamily() {
+ return projectFamily;
}
- public void setProjectClass(String projectClass) {
- this.projectClass = projectClass;
+ public void setProjectFamily(String projectFamily) {
+ this.projectFamily = projectFamily;
}
public String getProject() {
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index e3667ee..d509872 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -233,14 +233,14 @@
* Example: normalizeHost("en.m.zero.wikipedia.org")<br/>
* Returns:<br/>
* NormalizedHostInfo(
- * "project_class":"wikipedia",
+ * "projectFamily":"wikipedia",
* "project":"en",
* "qualifiers":["m", "zero"],
* "tld":"org",
* )
*
* @param uriHost The url's host
- * @return A NormalizedHostInfo object with project_class, project,
qualifiers and tld values set.
+ * @return A NormalizedHostInfo object with projectFamily, project,
qualifiers and tld values set.
*/
public NormalizedHostInfo normalizeHost(String uriHost) {
@@ -272,8 +272,8 @@
if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;
if (uriParts.length > 1) {
- // project_class and TLD normalization
- result.setProjectClass(uriParts[uriParts.length - 2]);
+ // project_family and TLD normalization
+ result.setProjectFamily(uriParts[uriParts.length - 2]);
result.setTld(uriParts[uriParts.length - 1]);
}
// project normalization
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
index b3d1d66..05f89d3 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
@@ -126,7 +126,7 @@
assertEquals(
test_description + " - Project Class",
expectedProjectClass,
- webrequest_inst.normalizeHost(uriHost).getProjectClass()
+ webrequest_inst.normalizeHost(uriHost).getProjectFamily()
);
assertEquals(
test_description + " - Project",
@@ -155,7 +155,7 @@
assertEquals(
"Null - Project Class",
NormalizedHostInfo.EMPTY_NORM_HOST_VALUE,
- webrequest_inst.normalizeHost(testUriHost).getProjectClass()
+ webrequest_inst.normalizeHost(testUriHost).getProjectFamily()
);
assertEquals(
"Null - Project",
@@ -179,7 +179,7 @@
assertEquals(
"Empty - Project Class",
NormalizedHostInfo.EMPTY_NORM_HOST_VALUE,
- webrequest_inst.normalizeHost(testUriHost).getProjectClass()
+ webrequest_inst.normalizeHost(testUriHost).getProjectFamily()
);
assertEquals(
"Empty - Project",
diff --git a/refinery-core/src/test/resources/normalize_host_test_data.csv
b/refinery-core/src/test/resources/normalize_host_test_data.csv
index bae97da..da162ef 100644
--- a/refinery-core/src/test/resources/normalize_host_test_data.csv
+++ b/refinery-core/src/test/resources/normalize_host_test_data.csv
@@ -1,4 +1,4 @@
-test_description,expected_project_class,expected_project,expected_qualifiers,expected_tld,uri_host
+test_description,expected_project_family,expected_project,expected_qualifiers,expected_tld,uri_host
IP,-,-,,-,192.168.0.1
No Dot,-,-,,-,wikipedia
One dot capital,wikipedia,-,,org,Wikipedia.org
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetHostPropertiesUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetHostPropertiesUDF.java
index 87f1b5a..d52fdc3 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetHostPropertiesUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetHostPropertiesUDF.java
@@ -14,7 +14,6 @@
import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
-import org.apache.log4j.Logger;
import org.wikimedia.analytics.refinery.core.*;
import java.util.*;
@@ -22,11 +21,13 @@
/**
* UDF that normalizes host (lower case, split) and returns a struct.
* Records are processed one by one.<p/>
+ * NOTE: project_class is renamed to project_family - we currently provide
both.
* Example:<br/>
* SELECT get_host_properties('en.m.zero.wikipedia.org') FROM test_table LIMIT
1;<br/>
* Returns:<br/>
* {
* "project_class":"wikipedia",
+ * "project_family":"wikipedia",
* "project":"en",
* "qualifiers":["m", "zero"],
* "tld":"org",
@@ -39,7 +40,7 @@
@UDFType(deterministic = true)
@Description(name = "get_host_properties", value = "_FUNC_(uri_host) - "
- + "Returns a map with project_class, project, qualifiers, tld keys and
"
+ + "Returns a map with project_family, project, qualifiers, tld keys
and "
+ "the appropriate values for each of them")
public class GetHostPropertiesUDF extends GenericUDF {
private Object[] result;
@@ -49,6 +50,7 @@
private StringObjectInspector argumentOI;
private int IDX_PROJECT_CLASS;
+ private int IDX_PROJECT_FAMILY;
private int IDX_PROJECT;
private int IDX_QUALIFIERS;
private int IDX_TLD;
@@ -111,6 +113,10 @@
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
IDX_PROJECT_CLASS=idx++;
+ fieldNames.add("project_family");
+
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+ IDX_PROJECT_FAMILY=idx++;
+
fieldNames.add("project");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
IDX_PROJECT=idx++;
@@ -168,11 +174,13 @@
if (normHost == null) {
result[IDX_PROJECT_CLASS] =
NormalizedHostInfo.EMPTY_NORM_HOST_VALUE;
+ result[IDX_PROJECT_FAMILY] =
NormalizedHostInfo.EMPTY_NORM_HOST_VALUE;
result[IDX_PROJECT] = NormalizedHostInfo.EMPTY_NORM_HOST_VALUE;
result[IDX_QUALIFIERS] = new ArrayList<String>();
result[IDX_TLD] = NormalizedHostInfo.EMPTY_NORM_HOST_VALUE;
} else {
- result[IDX_PROJECT_CLASS] = normHost.getProjectClass();
+ result[IDX_PROJECT_CLASS] = normHost.getProjectFamily();
+ result[IDX_PROJECT_FAMILY] = normHost.getProjectFamily();
result[IDX_PROJECT] = normHost.getProject();
result[IDX_QUALIFIERS] = normHost.getQualifiers();
result[IDX_TLD] = normHost.getTld();
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/HostNormalizerUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/HostNormalizerUDF.java
index 975a7d7..fc91474 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/HostNormalizerUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/HostNormalizerUDF.java
@@ -7,11 +7,13 @@
* Deprecated - Use GetHostPropertiesUDF
* UDF that normalizes host (lower case, split) and returns a struct.
* Records are processed one by one.<p/>
+ * NOTE: project_class is renamed to project_family - we currently provide
both.
* Example:<br/>
* SELECT normalize_host('en.m.zero.wikipedia.org') FROM test_table LIMIT
1;<br/>
* Returns:<br/>
* {
* "project_class":"wikipedia",
+ * "project_family":"wikipedia",
* "project":"en",
* "qualifiers":["m", "zero"],
* "tld":"org",
@@ -25,6 +27,6 @@
@Deprecated
@UDFType(deterministic = true)
@Description(name = "normalize_host", value = "_FUNC_(uri_host) - "
- + "Returns a map with project_class, project, qualifiers, tld keys and
"
+ + "Returns a map with project_family, project, qualifiers, tld keys
and "
+ "the appropriate values for each of them")
public class HostNormalizerUDF extends GetHostPropertiesUDF {}
\ No newline at end of file
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetHostPropertiesUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetHostPropertiesUDF.java
index f35701d..fe7c2b8 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetHostPropertiesUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetHostPropertiesUDF.java
@@ -67,7 +67,7 @@
)
public void testNormalizeHost(
String test_description,
- String expectedProjectClass,
+ String expectedProjectFamily,
String expectedProject,
String expectedQualifiers,
String expectedTld,
@@ -81,12 +81,14 @@
DeferredObject[] args = new DeferredObject[] { new
DeferredJavaObject(uriHost) };
Object[] res = (Object[]) getHostPropertiesUDF.evaluate(args);
- assertEquals("Result array has wrong length", 4, res.length);
+ // Hacked for normalized_host.project_family change
+ assertEquals("Result array has wrong length", 5, res.length);
- assertEquals(test_description + " - ProjectClass",
expectedProjectClass, res[0]);
- assertEquals(test_description + " - Project ", expectedProject,
res[1]);
- assertEquals(test_description + " - Qualifiers", expectedQualifiers,
join((List<String>)res[2], ";"));
- assertEquals(test_description + " - TLD", expectedTld, res[3]);
+ assertEquals(test_description + " - ProjectClass",
expectedProjectFamily, res[0]);
+ assertEquals(test_description + " - ProjectFamily",
expectedProjectFamily, res[1]);
+ assertEquals(test_description + " - Project ", expectedProject,
res[2]);
+ assertEquals(test_description + " - Qualifiers", expectedQualifiers,
join((List<String>)res[3], ";"));
+ assertEquals(test_description + " - TLD", expectedTld, res[4]);
getHostPropertiesUDF.close();
}
--
To view, visit https://gerrit.wikimedia.org/r/362159
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If2a3a285c6a4194be9287dfca2b9dbaada99a916
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits