moved to commons the quick (and dirty) trick to encode latin1 string from freebase
Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/b00e935d Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/b00e935d Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/b00e935d Branch: refs/heads/ldp Commit: b00e935d0fce5944367ed6f41f8ae108e008edf3 Parents: c5f0d76 Author: Sergio Fernández <[email protected]> Authored: Thu Apr 24 20:16:07 2014 +0200 Committer: Sergio Fernández <[email protected]> Committed: Thu Apr 24 20:16:07 2014 +0200 ---------------------------------------------------------------------- .../marmotta/commons/util/StringUtils.java | 40 ++++++++++++++++++++ .../ldclient/ldclient-provider-freebase/pom.xml | 4 ++ .../provider/freebase/FreebaseProvider.java | 20 ++-------- 3 files changed, 47 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java ---------------------------------------------------------------------- diff --git a/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java b/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java new file mode 100644 index 0000000..3e597c4 --- /dev/null +++ b/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java @@ -0,0 +1,40 @@ +package org.apache.marmotta.commons.util; + +/** + * Some util string functions + * + * @author Sergio Fernández + */ +public class StringUtils { + + private StringUtils() { + + } + + public static String fixLatin1(String str) { + //TODO: find a way to re-code properly the literal + //http://www.ic.unicamp.br/~stolfi/EXPORT/www/ISO-8859-1-Encoding.html + str = str.replaceAll("\\\\xe1", "á"); + str = str.replaceAll("\\\\xe2", "â"); + str = str.replaceAll("\\\\xe3", "ã"); + str = 
str.replaceAll("\\\\xe4", "ä"); + str = str.replaceAll("\\\\xe7", "ç"); + str = str.replaceAll("\\\\xe8", "è"); + str = str.replaceAll("\\\\xe9", "é"); + str = str.replaceAll("\\\\xea", "ê"); + str = str.replaceAll("\\\\xeb", "ë"); + str = str.replaceAll("\\\\xed", "í"); + str = str.replaceAll("\\\\xee", "î"); + str = str.replaceAll("\\\\xef", "ï"); + str = str.replaceAll("\\\\xf3", "ó"); + str = str.replaceAll("\\\\xf4", "ô"); + str = str.replaceAll("\\\\xf6", "ö"); + str = str.replaceAll("\\\\xf9", "ù"); + str = str.replaceAll("\\\\xfb", "û"); + str = str.replaceAll("\\\\xfc", "ü"); + str = str.replaceAll("\\\\xfa", "ú"); + str = str.replaceAll("\\\\x", ""); //FIXME: wrong, wrong, wrong! + return str; + } + +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/libraries/ldclient/ldclient-provider-freebase/pom.xml ---------------------------------------------------------------------- diff --git a/libraries/ldclient/ldclient-provider-freebase/pom.xml b/libraries/ldclient/ldclient-provider-freebase/pom.xml index 39f9848..5cba92c 100644 --- a/libraries/ldclient/ldclient-provider-freebase/pom.xml +++ b/libraries/ldclient/ldclient-provider-freebase/pom.xml @@ -42,6 +42,10 @@ <groupId>org.openrdf.sesame</groupId> <artifactId>sesame-rio-turtle</artifactId> </dependency> + <dependency> + <groupId>org.apache.marmotta</groupId> + <artifactId>marmotta-commons</artifactId> + </dependency> <dependency> <groupId>junit</groupId> http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java ---------------------------------------------------------------------- diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java index 
ef3652b..fe39296 100644 --- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java +++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java @@ -183,7 +183,7 @@ public class FreebaseProvider extends AbstractHttpProvider { } else if (o.contains("\\u")) { o = StringEscapeUtils.unescapeJava(o); } else if (o.contains("\\x")) { - o = fixLatin1(o); + o = org.apache.marmotta.commons.util.StringUtils.fixLatin1(o); } sb.append(" " + p + " " + o + tripleMatcher.group(3)); sb.append("\n"); @@ -213,7 +213,7 @@ public class FreebaseProvider extends AbstractHttpProvider { //wrong charset if (literal.contains("\\x")) { - literal = fixLatin1(literal); + literal = org.apache.marmotta.commons.util.StringUtils.fixLatin1(literal); } //wrong unicode encoding @@ -224,18 +224,4 @@ public class FreebaseProvider extends AbstractHttpProvider { return literal; } - private String fixLatin1(String str) { - //TODO: find a way to re-code properly the literal - //http://www.ic.unicamp.br/~stolfi/EXPORT/www/ISO-8859-1-Encoding.html - str = str.replaceAll("\\\\xe1", "á"); - str = str.replaceAll("\\\\xe3", "ã"); - str = str.replaceAll("\\\\xe7", "ç"); - str = str.replaceAll("\\\\xe9", "é"); - str = str.replaceAll("\\\\xed", "í"); - str = str.replaceAll("\\\\xf3", "ó"); - str = str.replaceAll("\\\\xfa", "ú"); - str = str.replaceAll("\\\\x", ""); //FIXME: wrong, wrong, wrong! - return str; - } - -} \ No newline at end of file +}
