--- /Volumes/Local/Users/amaxwell/Downloads/BDSKCOinSParser.m 2009-07-15 09:34:41.000000000 -0700 +++ /Volumes/Local/Users/amaxwell/build/bibdesk-tco/BDSKCOinSParser.m 2009-07-15 10:41:33.000000000 -0700 @@ -35,7 +35,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + */ #import "BDSKCOinSParser.h" #import @@ -43,7 +43,7 @@ /* The COinS or Z3988 format is a microformat which is embedded in web pages to include bibliographic information there. - + The data it transparts are stored in the title attribute of a span tag which has the class Z3988. That string is separated into fields by & strings. Each field contains a = with the string coming before it being the field name and the string coming after it being a (presumably UTF-8) percent encoded string. As COinS lacks any formal specification of what can/shoud/must occur when and where, parsing it is mostly an effort in heuristics (there is a lazy POS pseudo-spec). Implementations on web sites differ greatly as well. One supposes due to both the poor specification and the incompetence of the people doing the implementation. @@ -57,18 +57,18 @@ . Implementation for search results with plenty of junk characters in them: https://opacplus.bsb-muenchen.de/ . For single entry pages on http://citeseerx.ist.psu.edu/ . Wikipedia articles with references in them, e.g. http://en.wikipedia.org/wiki/Library feature poor quality COinS tags (whose author names appear in duplicate because of the "spec"'s ambiguity. -*/ + */ @implementation BDSKCOinSParser /* - Claim that the can parse the document if its markup contains the string Z3988. - The xmlDocument parameter cannot be used for this as its parsing automatically removes empty elements such as the spans used by COinS. -*/ + Claim that the can parse the document if its markup contains the string Z3988. + The xmlDocument parameter cannot be used for this as its parsing automatically removes empty elements such as the spans used by COinS. + */ + (BOOL)canParseDocument:(DOMDocument *)domDocument xmlDocument:(NSXMLDocument *)xmlDocument fromURL:(NSURL *)url{ NSString *containsZ3988Node = @".//span[@class='Z3988']"; - + NSError *error = nil; BOOL nodecountisok = [[[xmlDocument rootElement] nodesForXPath:containsZ3988Node error:&error] count] > 0; @@ -78,24 +78,23 @@ /* - Returns the content of the relevant title attributes in the document. - - Get the source code of the DOMDocument and match things which vaguely look like COinS records in there. - The xmlDocument variable cannot be used for this as its parsing automatically removes empty elements such as the spans used by COinS. - Matching of the relevant spans isn't theoretically perfect yet. If someone can write a regexp matching the title attribute of a span tag only if the class attribute of the tag contains the word Z3988, that may be more elegant. -*/ + Returns the content of the relevant title attributes in the document. + + Get the source code of the DOMDocument and match things which vaguely look like COinS records in there. + The xmlDocument variable cannot be used for this as its parsing automatically removes empty elements such as the spans used by COinS. + Matching of the relevant spans isn't theoretically perfect yet. If someone can write a regexp matching the title attribute of a span tag only if the class attribute of the tag contains the word Z3988, that may be more elegant. + */ + (NSArray *) Z3988MatchesForDocument: (NSXMLDocument *) xmlDocument { NSError *error; NSString *Z3988Path = @".//span[@class='Z3988']"; NSArray *Z3988Nodes = [[xmlDocument rootElement] nodesForXPath:Z3988Path error:&error]; NSEnumerator *nodeEnum = [Z3988Nodes objectEnumerator]; - NSXMLNode *node; + NSXMLElement *node; NSMutableArray *dataArray = [NSMutableArray arrayWithCapacity:[Z3988Nodes count]]; while (node = [nodeEnum nextObject]) { - NSString *title = [node stringValueOfAttribute:@"title"]; - if ([NSString isEmptyString:title]) - [dataArray addObject:title]; + if ([node kind] == NSXMLElementKind && [node attributeForName:@"title"] != nil) + [dataArray addObject:[[node attributeForName:@"title"] XMLString]]; } return dataArray; @@ -104,8 +103,8 @@ /* - Convert publication type name from COinS to BibTeX names. -*/ + Convert publication type name from COinS to BibTeX names. + */ + (NSString *) convertType:(NSString *) type { // default to misc. For unknown values as well as 'document', 'unknown', NSString * BibTeXType = @"GEN"; @@ -118,7 +117,7 @@ else if ([type isEqualToString:@"preprint"]) { BibTeXType = BDSKUnpublishedString; } else if ([type isEqualToString:@"proceeding"]) { BibTeXType = BDSKInproceedingsString; } else if ([type isEqualToString:@"report"]) { BibTeXType = BDSKTechreportString; } -// else if ([type isEqualToString:@"info:ofi/fmt:kev:mtx:dissertation"]) { BibTeXType = @"phdthesis"; } + // else if ([type isEqualToString:@"info:ofi/fmt:kev:mtx:dissertation"]) { BibTeXType = @"phdthesis"; } return BibTeXType; } @@ -126,9 +125,9 @@ /* - Converts a COins String to a BibItem. - All sorts of heuristics and attempts to interpret the format in there. -*/ + Converts a COins String to a BibItem. + All sorts of heuristics and attempts to interpret the format in there. + */ + (BibItem *) parseCOinSString: (NSString *) COinSString { NSString * inputString = COinSString; if ([inputString rangeOfString:@"%20"].location == NSNotFound) { @@ -141,7 +140,7 @@ if ([components count] < 2 ) { return nil; } NSEnumerator * myEnum = [components objectEnumerator]; NSString * component; - + BibItem * bibItem = [[[BibItem alloc] init] autorelease]; NSString * publicationType = BDSKMiscString; NSString * startPage = nil; @@ -150,165 +149,158 @@ NSString * auLast = nil; NSString * auInitials = nil; NSString * auSuffix = nil; - + while (component = [myEnum nextObject]) { NSArray * keyValue = [component componentsSeparatedByString:@"="]; if ([keyValue count] == 2 ) { NSString * key = [keyValue objectAtIndex:0]; NSString * value = [[(NSString*)[keyValue objectAtIndex:1] stringByReplacingPercentEscapesUsingEncoding:NSUTF8StringEncoding] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceCharacterSet]]; - + if (value) { - NSString * fieldName = nil; - if ([key isEqualToString:@"rft.genre"]) { - if ([publicationType isEqualToString:BDSKMiscString]) { - publicationType = [self convertType:value]; - } - } - else if ([key isEqualToString:@"rft.atitle"]) { // article title - fieldName = BDSKTitleString; - publicationType = @"article"; - } - else if ([key isEqualToString:@"rft.btitle"]) { // book title - fieldName = BDSKTitleString; - publicationType = @"book"; - } - else if ([key isEqualToString:@"rft.title"]) { // general title - fieldName = BDSKTitleString; - } - else if ([key isEqualToString:@"rft.jtitle"]) { // full journal title - fieldName = BDSKJournalString; - } - else if ([key isEqualToString:@"rft.stitle"]) { // short journal title: only use it if no full journal title is present - if ([[bibItem valueOfField:BDSKJournalString] length] == 0) { - fieldName = BDSKJournalString; - } - } - else if ([key isEqualToString:@"rft.series"]) { - fieldName = BDSKSeriesString; - } - else if ([key isEqualToString:@"rft.au"]) { // this simplistic approach hopes that .au is used rather than .aufirst .aulast etc. - fieldName = BDSKAuthorString; - } - else if ([key isEqualToString:@"rft.aufirst"]) { - auFirst = value; - } - else if ([key isEqualToString:@"rft.aulast"]) { - auLast = value; - } - else if ([key isEqualToString:@"rft.auinit"] || [key isEqualToString:@"rft.auinitm"]) { - if ( auInitials ) { - // append to existing initials - auInitials = [auInitials stringByAppendingFormat:@" %@", value]; - } - else { - auInitials = value; - } - } - else if ([key isEqualToString:@"rft.auinit1"]) { - if ( auInitials ) { - // prepend to existing initials - auInitials = [NSString stringWithFormat:@"%@ %@", value, auInitials]; - } - else { - auInitials = value; - } - } - else if ([key isEqualToString:@"rft.auSuffix"]) { - auSuffix = value; - } - else if ([key isEqualToString:@"rft.date"]) { - // try to find a four digit year, otherwise leave fieldName nil - // add support for months? - AGRegex * yearRegexp = [AGRegex regexWithPattern:@"[0-9]{4}"]; - AGRegexMatch * match = [yearRegexp findInString:value]; - if (match) { + NSString * fieldName = nil; + if ([key isEqualToString:@"rft.genre"]) { + if ([publicationType isEqualToString:BDSKMiscString]) { + publicationType = [self convertType:value]; + } + } + else if ([key isEqualToString:@"rft.atitle"]) { // article title + fieldName = BDSKTitleString; + publicationType = @"article"; + } + else if ([key isEqualToString:@"rft.btitle"]) { // book title + fieldName = BDSKTitleString; + publicationType = @"book"; + } + else if ([key isEqualToString:@"rft.title"]) { // general title + fieldName = BDSKTitleString; + } + else if ([key isEqualToString:@"rft.jtitle"]) { // full journal title + fieldName = BDSKJournalString; + } + else if ([key isEqualToString:@"rft.stitle"]) { // short journal title: only use it if no full journal title is present + if ([[bibItem valueOfField:BDSKJournalString] length] == 0) { + fieldName = BDSKJournalString; + } + } + else if ([key isEqualToString:@"rft.series"]) { + fieldName = BDSKSeriesString; + } + else if ([key isEqualToString:@"rft.au"]) { // this simplistic approach hopes that .au is used rather than .aufirst .aulast etc. + fieldName = BDSKAuthorString; + } + else if ([key isEqualToString:@"rft.aufirst"]) { + auFirst = value; + } + else if ([key isEqualToString:@"rft.aulast"]) { + auLast = value; + } + else if ([key isEqualToString:@"rft.auinit"] || [key isEqualToString:@"rft.auinitm"]) { + if ( auInitials ) { + // append to existing initials + auInitials = [auInitials stringByAppendingFormat:@" %@", value]; + } + else { + auInitials = value; + } + } + else if ([key isEqualToString:@"rft.auinit1"]) { + if ( auInitials ) { + // prepend to existing initials + auInitials = [NSString stringWithFormat:@"%@ %@", value, auInitials]; + } + else { + auInitials = value; + } + } + else if ([key isEqualToString:@"rft.auSuffix"]) { + auSuffix = value; + } + else if ([key isEqualToString:@"rft.date"]) { + // try to find a four digit year, otherwise leave fieldName nil + // add support for months? + AGRegex * yearRegexp = [AGRegex regexWithPattern:@"[0-9]{4}"]; + AGRegexMatch * match = [yearRegexp findInString:value]; + if (match) { value = [match group]; fieldName = BDSKYearString; - } - } - else if ([key isEqualToString:@"rft.pub"]) { // publisher - fieldName = BDSKPublisherString; - } - else if ([key isEqualToString:@"rft.place"]) { - fieldName = BDSKAddressString; - } - else if ([key isEqualToString:@"rft.edition"]) { - fieldName = @"Edition"; - } - else if ([key isEqualToString:@"rft.volume"]) { - fieldName = BDSKVolumeString; - } - else if ([key isEqualToString:@"rft.issue"]) { - fieldName = BDSKNumberString; - } - else if ([key isEqualToString:@"rft.pages"] || [key isEqualToString:@"rft.tpages"]) { - fieldName = BDSKPagesString; - } - else if ([key isEqualToString:@"rft.spage"]) { // start page - startPage = value; - } - else if ([key isEqualToString:@"rft.epage"]) { // end page - endPage = value; - } - else if ([key isEqualToString:@"rft_id"] || [key isEqualToString:@"rft.identifier"]) { - // these are most likely URLs or DOI type information - NSURL * URL = [NSURL URLWithString:value]; - if (URL) { - if ( [[URL scheme] rangeOfString:@"http" options:NSLiteralSearch].location != NSNotFound ) { - // add http/https URLs to the FileView items only, rather than the Url field. This lets us process more than one of them and avoid adding links to library catalogue entries to the BibTeX record. I haven't seen other usable URL typese yet. - [bibItem addFileForURL:URL autoFile:NO runScriptHook:NO]; - } - } - else { - // it's not a URL, what now? ignore? - } - if ([value rangeOfString:@"doi" options:NSCaseInsensitiveSearch].location != NSNotFound) { - // the value contains doi, so assume it's DOI information and also add it to the DOI field. There should only be a single occurrence of those, so add it right here to make sure the format isn't messed up in case multiple fields contain that substring - AGRegex * DOIRegex = [AGRegex regexWithPattern:@"10.[0-9/.%a-zA-Z]+" options:0]; - AGRegexMatch * match = [DOIRegex findInString:value]; - if (match) { - NSString * DOI = [match group]; - [bibItem setField:BDSKDoiString toValue:DOI]; - } - } - } - else if ([key isEqualToString:@"rft.isbn"]) { - fieldName = @"ISBN"; - } - else if ([key isEqualToString:@"rft.issn"]) { - fieldName = @"ISSN"; - } - else if ([key isEqualToString:@"rft.aucorp"]) { - fieldName = BDSKInstitutionString; - } - else if ([key isEqualToString:@"rft.description"]) { // ? - fieldName = @"Comments"; - } - - // ignored items which apparently may exist: rft.artnum (kind of ID), rft.part, rft.coden (no clue), rft.sici (no clue), rft.chron (free-style dates), rft.ssn (Seasonal Dates), rft.quarter - - if ( fieldName ) { - NSString * previousValue = [bibItem valueOfField:fieldName]; - BOOL wasNonEmpty = ([previousValue length] > 0); - - /* now treat a few cases specially */ - if ( [fieldName isEqualToString:BDSKAuthorString] && wasNonEmpty ) { - // if author already exists, append another one with an 'and' separator - value = [previousValue stringByAppendingFormat:@" and %@", value]; - } - else if ( wasNonEmpty ) { - // for other values append multiple occurrencs with a semicolon as a separator, make sure the new string is not contained in the existing string already before adding it as sometimes fields end up twice in the COinS record - if ( [previousValue rangeOfString:value options:NSLiteralSearch].location == NSNotFound ) { - value = [previousValue stringByAppendingFormat:@"; %@", value]; - } - } - - if (value) { - [bibItem setField:fieldName toValue:value]; - } - } - + } + } + else if ([key isEqualToString:@"rft.pub"]) { // publisher + fieldName = BDSKPublisherString; + } + else if ([key isEqualToString:@"rft.place"]) { + fieldName = BDSKAddressString; + } + else if ([key isEqualToString:@"rft.edition"]) { + fieldName = @"Edition"; + } + else if ([key isEqualToString:@"rft.volume"]) { + fieldName = BDSKVolumeString; + } + else if ([key isEqualToString:@"rft.issue"]) { + fieldName = BDSKNumberString; + } + else if ([key isEqualToString:@"rft.pages"] || [key isEqualToString:@"rft.tpages"]) { + fieldName = BDSKPagesString; + } + else if ([key isEqualToString:@"rft.spage"]) { // start page + startPage = value; + } + else if ([key isEqualToString:@"rft.epage"]) { // end page + endPage = value; + } + else if ([key isEqualToString:@"rft_id"] || [key isEqualToString:@"rft.identifier"]) { + // these are most likely URLs or DOI type information + NSURL * URL = [NSURL URLWithString:value]; + if (URL && [[URL scheme] rangeOfString:@"http" options:NSLiteralSearch].location != NSNotFound) { + // add http/https URLs to the FileView items only, rather than the Url field. This lets us process more than one of them and avoid adding links to library catalogue entries to the BibTeX record. I haven't seen other usable URL typese yet. + [bibItem addFileForURL:URL autoFile:NO runScriptHook:NO]; + } + + if ([value rangeOfString:@"doi" options:NSCaseInsensitiveSearch].location != NSNotFound) { + // the value contains doi, so assume it's DOI information and also add it to the DOI field. There should only be a single occurrence of those, so add it right here to make sure the format isn't messed up in case multiple fields contain that substring + NSRange range = [value rangeOfCharacterFromSet:[NSCharacterSet decimalDigitCharacterSet]]; + if (range.length && range.location > 0) + [bibItem setField:BDSKDoiString toValue:[value substringFromIndex:range.location]]; + } + } + else if ([key isEqualToString:@"rft.isbn"]) { + fieldName = @"ISBN"; + } + else if ([key isEqualToString:@"rft.issn"]) { + fieldName = @"ISSN"; + } + else if ([key isEqualToString:@"rft.aucorp"]) { + fieldName = BDSKInstitutionString; + } + else if ([key isEqualToString:@"rft.description"]) { // ? + fieldName = @"Comments"; + } + + // ignored items which apparently may exist: rft.artnum (kind of ID), rft.part, rft.coden (no clue), rft.sici (no clue), rft.chron (free-style dates), rft.ssn (Seasonal Dates), rft.quarter + + if ( fieldName ) { + NSString * previousValue = [bibItem valueOfField:fieldName]; + BOOL wasNonEmpty = ([previousValue length] > 0); + + /* now treat a few cases specially */ + if ( [fieldName isEqualToString:BDSKAuthorString] && wasNonEmpty ) { + // if author already exists, append another one with an 'and' separator + value = [previousValue stringByAppendingFormat:@" and %@", value]; + } + else if ( wasNonEmpty ) { + // for other values append multiple occurrencs with a semicolon as a separator, make sure the new string is not contained in the existing string already before adding it as sometimes fields end up twice in the COinS record + if ( [previousValue rangeOfString:value options:NSLiteralSearch].location == NSNotFound ) { + value = [previousValue stringByAppendingFormat:@"; %@", value]; + } + } + + if (value) { + [bibItem setField:fieldName toValue:value]; + } + } + } } @@ -347,21 +339,21 @@ } [bibItem setPubType:publicationType]; - + return bibItem; } /* - Process the document. -*/ + Process the document. + */ + (NSArray *)itemsFromDocument:(DOMDocument *)domDocument xmlDocument:(NSXMLDocument *)xmlDocument fromURL:(NSURL *)url error:(NSError **)outError{ NSArray * entries = [BDSKCOinSParser Z3988MatchesForDocument:xmlDocument]; NSString * entry; NSEnumerator * myEnum = [entries objectEnumerator]; NSMutableArray * results = [NSMutableArray arrayWithCapacity:[entries count]]; - + while (entry = [myEnum nextObject]) { BibItem * bibItem = [BDSKCOinSParser parseCOinSString:entry]; if (bibItem) { @@ -373,16 +365,4 @@ } - -/* - Array with feature description dictionary for the COinS microformat. -*/ -+ (NSArray *) parserInfos { - NSString * parserDescription = NSLocalizedString(@"The COinS microformat can be used to embed bibliographic information in web pages.", @"Description for COinS mircoformat"); - NSDictionary * parserInfo = [BDSKWebParser parserInfoWithName:@"COinS" address:@"http://ocoins.info/" description: parserDescription flags: BDSKParserFeatureAllPagesMask]; - - return [NSArray arrayWithObject:parserInfo]; -} - - @end