Author: jerome
Date: Wed Aug 17 09:10:23 2005
New Revision: 233192
URL: http://svn.apache.org/viewcvs?rev=233192&view=rev
Log:
Fix the issue reported by Andrzej Bialecki in
http://www.mail-archive.com/nutch-dev%40lucene.apache.org/msg00065.html.
The HTMLLanguageParser:
* can now extract the language code from a string that is not compliant with RFC 1766
* checks that the extracted language is a valid language code.
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=233192&r1=233191&r2=233192&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
Wed Aug 17 09:10:23 2005
@@ -14,23 +14,62 @@
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
+
+// JDK imports
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Nutch imports
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
-import org.w3c.dom.*;
-
-import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
-/** Adds metadata identifying language of document if found
+// DOM imports
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+
+/**
+ * Adds metadata identifying language of document if found
* We could also run statistical analysis here but we'd miss all other formats
*/
public class HTMLLanguageParser implements HtmlParseFilter {
+
public static final String META_LANG_NAME="X-meta-lang";
public static final Logger LOG = LogFormatter
.getLogger(HTMLLanguageParser.class.getName());
+ /* A static Map of ISO-639 language codes */
+ private static Map LANGUAGES_MAP = new HashMap();
+ static {
+ try {
+ Properties p = new Properties();
+ p.load(HTMLLanguageParser.class
+
.getResourceAsStream("langmappings.properties"));
+ Enumeration keys = p.keys();
+ while (keys.hasMoreElements()) {
+ String key = (String) keys.nextElement();
+ String[] values = p.getProperty(key).split(",", -1);
+ LANGUAGES_MAP.put(key, key);
+ for (int i=0; i<values.length; i++) {
+ LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+ }
+ }
+ } catch (Exception e) {
+ LOG.severe(e.toString());
+ }
+ }
+
+
+
/**
* Scan the HTML document looking at possible indications of content
language<br>
* <li>1. html lang attribute
(http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
@@ -39,60 +78,122 @@
* <br>Only the first occurence of language is stored.
*/
public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags,
DocumentFragment doc) {
- String lang = findLanguage(doc);
+
+ // Trying to find the document's language
+ LanguageParser parser = new LanguageParser(doc);
+ String lang = parser.getLanguage();
if (lang != null) {
parse.getData().getMetadata().put(META_LANG_NAME, lang);
}
-
return parse;
}
-
- private String findLanguage(Node node) {
- String lang = null;
- if (node.getNodeType() == Node.ELEMENT_NODE) {
-
- //lang attribute
- lang = ((Element) node).getAttribute("lang");
- if (lang != null && lang.length()>1) {
- return lang;
- }
- if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ static class LanguageParser {
+
+    /** Language taken from a <code>dc.language</code> meta tag, or <code>null</code>. */
+    private String dublinCore;
+    /** Language taken from an element's <code>lang</code> attribute, or <code>null</code>. */
+    private String htmlAttribute;
+    /** Language taken from an <code>http-equiv</code> content-language meta tag, or <code>null</code>. */
+    private String httpEquiv;
+    /** Language finally retained for the document, or <code>null</code>. */
+    private String language;
+
+    /**
+     * Scans <code>node</code> and its descendants, then retains the first
+     * non-null candidate in this order: <code>lang</code> attribute,
+     * <code>dc.language</code> meta, <code>http-equiv</code> meta.
+     */
+    LanguageParser(Node node) {
+      parse(node);
+      language = (htmlAttribute != null) ? htmlAttribute
+               : (dublinCore != null) ? dublinCore
+               : httpEquiv;
+    }
+
+    /** Returns the retained language code, or <code>null</code> if none was found. */
+    String getLanguage() {
+      return this.language;
+    }
+
+ void parse(Node node) {
- NamedNodeMap attrs=node.getAttributes();
+ String lang = null;
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+ // Check for the lang HTML attribute
+ if (htmlAttribute == null) {
+ htmlAttribute = parseLanguage(((Element) node).getAttribute("lang"));
+ }
- //dc.language
- for(int i=0;i<attrs.getLength();i++){
- Node attrnode=attrs.item(i);
- if("name".equalsIgnoreCase(attrnode.getNodeName())){
- if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){
- Node valueattr=attrs.getNamedItem("content");
- lang = (valueattr!=null)?valueattr.getNodeValue():null;
+ // Check for Meta
+ if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+
+ // Check for the dc.language Meta
+ if (dublinCore == null) {
+ for (int i=0; i<attrs.getLength(); i++) {
+ Node attrnode = attrs.item(i);
+ if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
+ if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
+ Node valueattr = attrs.getNamedItem("content");
+ if (valueattr != null) {
+ dublinCore = parseLanguage(valueattr.getNodeValue());
+ }
+ }
+ }
}
}
- }
-
- //http-equiv content-language
- for(int i=0;i<attrs.getLength();i++){
- Node attrnode=attrs.item(i);
- if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){
-
if("content-language".equals(attrnode.getNodeValue().toLowerCase())){
- Node valueattr=attrs.getNamedItem("content");
- lang = (valueattr!=null)?valueattr.getNodeValue():null;
+
+ // Check for the http-equiv content-language
+ if (httpEquiv == null) {
+ for (int i=0; i<attrs.getLength(); i++){
+ Node attrnode = attrs.item(i);
+ if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
+ if
("content-language".equals(attrnode.getNodeValue().toLowerCase())) {
+ Node valueattr = attrs.getNamedItem("content");
+ if (valueattr != null) {
+ httpEquiv = parseLanguage(valueattr.getNodeValue());
+ }
+ }
+ }
}
}
}
}
+
+ // Recurse
+ NodeList children = node.getChildNodes();
+ for (int i=0; children != null && i<children.getLength(); i++) {
+ parse(children.item(i));
+ if ((dublinCore != null) &&
+ (htmlAttribute != null) &&
+ (httpEquiv != null)) {
+ return;
+ }
+ }
}
-
- //recurse
- NodeList children = node.getChildNodes();
- for (int i = 0; children != null && i < children.getLength(); i++) {
- lang = findLanguage(children.item(i));
- if(lang != null && lang.length()>1) return lang;
+
+    /**
+     * Parse a language string and return an ISO 639 primary code,
+     * or <code>null</code> if no known language is found.
+     * <p>The value is first split on commas, spaces, semicolons, dots,
+     * parentheses and equal signs. For each fragment, in order, the part
+     * preceding the first <code>-</code> or <code>_</code> is lower-cased
+     * and looked up in the languages map; the first hit is returned.</p>
+     *
+     * @param lang the raw attribute or meta value; may be <code>null</code>.
+     * @return the mapped language code, or <code>null</code> if
+     *         <code>lang</code> is <code>null</code> or no fragment matches.
+     */
+    final static String parseLanguage(String lang) {
+
+      if (lang == null) { return null; }
+
+      // First, split multi-valued values
+      String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+
+      for (int i=0; i<langs.length; i++) {
+        // Then, get the primary code: everything before the first '-' or '_'.
+        // indexOf is used instead of split("-")[0] / split("_")[0] because
+        // splitting a fragment made only of the separator ("-" or "_")
+        // yields an empty array, and indexing it threw
+        // ArrayIndexOutOfBoundsException (e.g. for "Deutsch - German").
+        String code = langs[i];
+        int dash = code.indexOf('-');
+        if (dash >= 0) { code = code.substring(0, dash); }
+        int underscore = code.indexOf('_');
+        if (underscore >= 0) { code = code.substring(0, underscore); }
+        if (code.length() == 0) { continue; }
+
+        // Find the ISO 639 code
+        String language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+        if (language != null) { return language; }
+      }
+
+      return null;
+    }
-
- return lang;
+
}
+
+
}
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties?rev=233192&r1=233191&r2=233192&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
Wed Aug 17 09:10:23 2005
@@ -1,187 +1,188 @@
-aa=aar
-ab=abk
+# Defines mappings between common erroneous language codes (or names) and
+# the ISO 639 two-letter language codes.
+aa=aar,Afar
+ab=abk,Abkhazian
ae=ave
-af=afr
+af=afr,Afrikaans
ak=aka
-am=amh
+am=amh,Amharic
an=arg
-ar=ara
-as=asm
+ar=ara,Arabic
+as=asm,Assamese
av=ava
-ay=aym
-az=aze
-ba=bak
-be=bel
-bg=bul
-bh=bih
-bi=bis
+ay=aym,Aymara
+az=aze,Azerbaijani
+ba=bak,Bashkir
+be=bel,Byelorussian
+bg=bul,Bulgarian
+bh=bih,Bihari
+bi=bis,Bislama
bm=bam
-bn=ben
-bo=tib/bod
-br=bre
+bn=ben,Bengali
+bo=bod,tib,Tibetan
+br=bre,Breton
bs=bos
-ca=cat
+ca=cat,Catalan
ce=che
ch=cha
-co=cos
+co=cos,Corsican
cr=cre
-cs=cze/ces
+cs=ces,cze,Czech
cu=chu
cv=chv
-cy=wel/cym
-da=dan
-de=ger/deu
+cy=cym,wel,Welsh
+da=dan,Danish
+de=deu,ger,German
dv=div
-dz=dzo
+dz=dzo,Dzongkha
ee=ewe
-el=gre/ell
-en=eng
-eo=epo
-es=spa
-et=est
-eu=baq/eus
-fa=per/fas
+el=ell,gre,Greek
+en=eng,English
+eo=epo,Esperanto
+es=esl,spa,Spanish
+et=est,Estonian
+eu=baq,eus,Basque
+fa=fas,per,Persian
ff=ful
-fi=fin
-fj=fij
-fo=fao
-fr=fre/fra
-fy=fry
-ga=gle
+fi=fin,Finnish
+fj=fij,Fijian
+fo=fao,Faroese
+fr=fra,fre,French
+fy=fry,Frisian
+ga=gai,iri,Irish
gd=gla
-gl=glg
-gn=grn
-gu=guj
+gl=glg,Gallegan
+gn=grn,Guarani
+gu=guj,Gujarati
gv=glv
-ha=hau
-he=heb
-hi=hin
+ha=hau,Hausa
+he=heb,Hebrew
+hi=hin,Hindi
ho=hmo
-hr=scr/hrv
+hr=scr,hrv,Croatian
ht=hat
-hu=hun
-hy=arm/hye
+hu=hun,Hungarian
+hy=arm,hye,Armenian
hz=her
-ia=ina
-id=ind
+ia=ina,Interlingua
+id=ind,Indonesian
ie=ile
ig=ibo
ii=iii
-ik=ipk
+ik=ipk,Inupiak
io=ido
-is=ice/isl
-it=ita
-iu=iku
-ja=jpn
-jv=jav
-ka=geo/kat
+is=ice,isl,Icelandic
+it=ita,Italian
+iu=iku,Inuktitut
+ja=jpn,Japanese
+jv=jw,jav,jaw,Javanese
+ka=geo,kat,Georgian
kg=kon
ki=kik
-ki=kik
-kj=kua
kj=kua
-kk=kaz
-kl=kal
-km=khm
-kn=kan
-ko=kor
+kk=kaz,Kazakh
+kl=kal,Greenlandic
+km=khm,Khmer
+kn=kan,Kannada
+ko=kor,Korean
kr=kau
-ks=kas
-ku=kur
+ks=kas,Kashmiri
+ku=kur,Kurdish
kv=kom
kw=cor
-ky=kir
-la=lat
+ky=kir,Kirghiz
+la=lat,Latin
lb=ltz
lg=lug
li=lim
-ln=lin
-lo=lao
-lt=lit
+ln=lin,Lingala
+lo=lao,Lao
+lt=lit,Lithuanian
lu=lub
-lv=lav
-mg=mlg
+lv=lav,Latvian
+mg=mlg,Malagasy
mh=mah
-mi=mao/mri
-mk=mac/mkd
-ml=mal
-mn=mon
-mo=mol
-mr=mar
-ms=may/msa
+mi=mao,mri,Maori
+mk=mac,mak,Macedonian
+ml=mal,Malayalam
+mn=mon,Mongolian
+mo=mol,Moldavian
+mr=mar,Marathi
+ms=may,msa,Malay
mt=mlt
-my=bur/mya
-na=nau
+my=bur,mya,Burmese
+na=nau,Nauru
nb=nob
nd=nde
-ne=nep
+ne=nep,Nepali
ng=ndo
-nl=dut/nld
+nl=dut,nla,Dutch
nn=nno
-no=nor
+no=nor,Norwegian
nr=nbl
nv=nav
ny=nya
-oc=oci
+oc=oci,Langue d'Oc
oj=oji
-om=orm
-or=ori
+om=orm,Oromo
+or=ori,Oriya
os=oss
-pa=pan
+pa=pan,Panjabi
pi=pli
-pl=pol
-ps=pus
-pt=por
-qu=que
-rm=roh
-rn=run
-ro=rum/ron
-ru=rus
-rw=kin
-sa=san
+pl=pol,Polish
+ps=pus,Pushto
+pt=por,Portuguese
+qu=que,Quechua
+rm=roh,Rhaeto-Romance
+rn=run,Rundi
+ro=ron,rum,Romanian
+ru=rus,Russian
+rw=kin,Kinyarwanda
+sa=san,Sanskrit
sc=srd
-sd=snd
+sd=snd,Sindhi
se=sme
-sg=sag
-si=sin
-sk=slo/slk
-sl=slv
-sm=smo
-sn=sna
-so=som
-sq=alb/sqi
-sr=scc/srp
-ss=ssw
-st=sot
-su=sun
-sv=swe
-sw=swa
-ta=tam
-te=tel
-tg=tgk
-th=tha
-ti=tir
-tk=tuk
-tl=tgl
-tn=tsn
-to=ton
-tr=tur
-ts=tso
-tt=tat
-tw=twi
+sg=sag,Sango
+sh=scr,Serbo-Croatian
+si=sin,Singhalese
+sk=slk,slo,Slovak
+sl=slv,Slovenian
+sm=smo,Samoan
+sn=sna,Shona
+so=som,Somali
+sq=alb,sqi,Albanian
+sr=scc,srp,Serbian
+ss=ssw,Siswant
+st=sot,Sotho
+su=sun,Sudanese
+sv=sve,swe,Swedish,Svenska,Sweden
+sw=swa,Swahili
+ta=tam,Tamil
+te=tel,Telugu
+tg=tgk,Tajik
+th=tha,Thai
+ti=tir,Tigrinya
+tk=tuk,Turkmen
+tl=tgl,Tagalog
+tn=tsn,Tswana
+to=tog,Tonga
+tr=tur,Turkish
+ts=tso,Tsonga
+tt=tat,Tatar
+tw=twi,Twi
ty=tah
-ug=uig
-uk=ukr
-ur=urd
-uz=uzb
+ug=uig,Uighur
+uk=ukr,Ukrainian
+ur=urd,Urdu
+uz=uzb,Uzbek
ve=ven
-vi=vie
-vo=vol
+vi=vie,Vietnamese
+vo=vol,Volapük
wa=wln
-wo=wol
-xh=xho
-yi=yid
-yo=yor
-za=zha
-zh=chi/zho
-zu=zul
+wo=wol,Wolof
+xh=xho,Xhosa
+yi=yid,Yiddish
+yo=yor,Yoruba
+za=zha,Zhuang
+zh=chi,zho,Chinese
+zu=zul,Zulu
Modified:
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=233192&r1=233191&r2=233192&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
Wed Aug 17 09:10:23 2005
@@ -15,14 +15,19 @@
*/
package org.apache.nutch.analysis.lang;
+// JDK imports
import java.util.Properties;
+// JUnit imports
import junit.framework.TestCase;
+
+// Nutch imports
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
+
public class TestHTMLLanguageParser extends TestCase {
private static String URL = "http://foo.bar/";
@@ -61,6 +66,61 @@
}
+  /**
+   * Test of <code>LanguageParser.parseLanguage(String)</code> method.
+   * Each row holds a raw language value and the code expected for it
+   * (<code>null</code> when no language should be recognized).
+   */
+  public void testParseLanguage() {
+    String tests[][] = {
+      { "(SCHEME=ISO.639-1) sv", "sv" },
+      { "(SCHEME=RFC1766) sv-FI", "sv" },
+      { "(SCHEME=Z39.53) SWE", "sv" },
+      { "EN_US, SV, EN, EN_UK", "en" },
+      { "English Swedish", "en" },
+      { "English, swedish", "en" },
+      { "English,Swedish", "en" },
+      { "Other (Svenska)", "sv" },
+      { "SE", "se" },
+      { "SV", "sv" },
+      { "SV charset=iso-8859-1", "sv" },
+      { "SV-FI", "sv" },
+      { "SV; charset=iso-8859-1", "sv" },
+      { "SVE", "sv" },
+      { "SW", "sw" },
+      { "SWE", "sv" },
+      { "SWEDISH", "sv" },
+      { "Sv", "sv" },
+      { "Sve", "sv" },
+      { "Svenska", "sv" },
+      { "Swedish", "sv" },
+      { "Swedish, svenska", "sv" },
+      { "en, sv", "en" },
+      { "sv", "sv" },
+      { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" },
+      { "sv,en", "sv" },
+      { "sv-FI", "sv" },
+      { "sv-SE", "sv" },
+      { "sv-en", "sv" },
+      { "sv-fi", "sv" },
+      { "sv-se", "sv" },
+      { "sv; Content-Language: sv", "sv" },
+      { "sv_SE", "sv" },
+      { "sve", "sv" },
+      { "svenska, swedish, engelska, english", "sv" },
+      { "sw", "sw" },
+      { "swe", "sv" },
+      { "swe.SPR.", "sv" },
+      { "sweden", "sv" },
+      { "swedish", "sv" },
+      { "swedish,", "sv" },
+      { "text/html; charset=sv-SE", "sv" },
+      { "text/html; sv", "sv" },
+      { "torp, stuga, uthyres, bed & breakfast", null }
+    };
+
+    // Iterate over the whole table instead of a hard-coded row count, so
+    // rows added later are actually exercised; name the input on failure.
+    for (int i=0; i<tests.length; i++) {
+      assertEquals("Input: " + tests[i][0], tests[i][1],
+                   HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+    }
+  }
+
+
+
+
private Content getContent(String text) {
Properties p = new Properties();
p.put("Content-Type", "text/html");
@@ -68,4 +128,5 @@
Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
return content;
}
+
}