Revision: 10928
http://bibdesk.svn.sourceforge.net/bibdesk/?rev=10928&view=rev
Author: amaxwell
Date: 2007-08-16 21:05:51 -0700 (Thu, 16 Aug 2007)
Log Message:
-----------
Revise field handling for SciFinder parsing. Dump most of the
non-BibTeX fields in Annote, since they're usually not useful,
according to Holger. This can be revisited if someone complains.
Modified Paths:
--------------
trunk/bibdesk/BDSKSciFinderParser.m
Modified: trunk/bibdesk/BDSKSciFinderParser.m
===================================================================
--- trunk/bibdesk/BDSKSciFinderParser.m 2007-08-16 23:33:22 UTC (rev 10927)
+++ trunk/bibdesk/BDSKSciFinderParser.m 2007-08-17 04:05:51 UTC (rev 10928)
@@ -58,25 +58,44 @@
// advance range past the ":"
r.location += 1;
*value = (id)CFStringCreateWithSubstring(alloc, (CFStringRef)line,
CFRangeMake(r.location, len - r.location));
- return YES;
+
+ // just checking length may not be sufficient; some entries have a
single space past the colon
+ if ([*value rangeOfCharacterFromSet:[NSCharacterSet
alphanumericCharacterSet]].length)
+ return YES;
+ // no meaningful characters, so release and return NO
+ [*key release];
+ [*value release];
}
return NO;
}
+static NSSet *correctFields = nil;
+
++ (void)initialize
+{
+ OBINITIALIZE;
+ correctFields = [[NSSet alloc] initWithObjects:BDSKVolumeString,
@"Language", BDSKAbstractString, nil];
+}
+
// some more-or-less unique string that meets our field name criteria (leading
cap, no space)
-static NSString *__documentTypeString = @"Document-Type";
+static NSString *__documentTypeString = @"Doc-Type";
static void fixAndAddKeyValueToDictionary(NSString *key, NSString *value,
NSMutableDictionary *pubFields)
-{
- // @@ most of this needs to be replaced by TypeInfo.plist dictionaries
- static NSCharacterSet *replaceChars = nil;
- if (nil == replaceChars) {
- replaceChars = [[NSCharacterSet characterSetWithCharactersInString:@"
."] copy];
+{
+ // We could move some of this into TypeInfo.plist, but we only have three
fields that don't need special handling, so it's not really worthwhile. This
function has multiple early returns, so be careful when debugging.
+
+ if ([key isEqualToString:BDSKAuthorString]) {
+ value = [value stringByReplacingAllOccurrencesOfString:@"; "
withString:@" and "];
}
- if ([key isEqualToString:@"Author"])
- value = [value stringByReplacingAllOccurrencesOfString:@"; "
withString:@" and "];
- else if ([key isEqualToString:@"Journal Title"])
+ else if ([key isEqualToString:@"Full Journal Title"]) {
key = BDSKJournalString;
+ }
+ else if ([key isEqualToString:@"Journal Title"]) {
+ key = BDSKJournalString;
+ // if we already have a Journal definition, bail out, because it's
from "Full Journal Title"
+ if ([pubFields objectForKey:key] != nil)
+ return;
+ }
else if ([key isEqualToString:@"Document Type"]) {
// parse this here and add to the dictionary, to be removed later when
we match it up with a BibTeX type
NSRange r = [value rangeOfString:@";"];
@@ -84,19 +103,47 @@
value = [value substringWithRange:NSMakeRange(0, r.location)];
key = __documentTypeString;
}
- else if ([key isEqualToString:@"Publication Year"])
+ else if ([key isEqualToString:@"Publication Year"]) {
key = BDSKYearString;
+ }
+ else if ([key isEqualToString:@"Publication Date"]) {
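+ // user says that one database uses Publication Year, and the other uses Publication Date, and recommends we prefer year
+ // NOTE(review): the check below bails out when Year is NOT yet set and otherwise overwrites it, which looks inverted relative to "prefer year" (compare the Journal Title branch, which returns when the key is already present) -- confirm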
+ key = BDSKYearString;
+ if ([pubFields objectForKey:key] == nil)
+ return;
+ }
+ else if ([key isEqualToString:@"Corporate Source"]) {
+ key = BDSKAddressString;
+ }
else if ([key isEqualToString:@"Page"]) {
key = BDSKPagesString;
if ([value rangeOfString:@"--"].location == NSNotFound)
value = [value stringByReplacingAllOccurrencesOfString:@"-"
withString:@"--"];
}
- else if ([key isEqualToString:@"Issue"])
+ else if ([key isEqualToString:@"Issue"]) {
key = BDSKNumberString;
- else if ([key isEqualToString:@"Title"] && [value hasSuffix:@"."]) // many
entries seem to have a trailing "." on the title
- value = [value stringByRemovingSuffix:@"."];
- else if ([key rangeOfCharacterFromSet:replaceChars].length)
- key = [key stringByReplacingCharactersInSet:replaceChars
withString:@"-"];
+ }
+ else if ([key isEqualToString:BDSKTitleString]) {
+ // many entries seem to have a trailing "." on the title
+ if ([value hasSuffix:@"."])
+ value = [value stringByRemovingSuffix:@"."];
+ }
+ else if ([key isEqualToString:@"Index Terms"]) {
+ // plain Index Terms become Keywords; Index Terms(2) and Supplementary Terms do not match here
+ // and fall through to the Annote branch below, since the user says they're generally garbage
+ key = BDSKKeywordsString;
+ }
+ else if ([correctFields containsObject:key] == NO) {
+ // this is a field that isn't meaningful, so dump it into Annote
+ NSMutableString *mutString = [pubFields objectForKey:BDSKAnnoteString];
+ if (nil == mutString) {
+ mutString = [NSMutableString string];
+ [pubFields setObject:mutString forKey:BDSKAnnoteString];
+ }
+ [mutString appendFormat:@"%@:[EMAIL PROTECTED]", key, value];
+
+ // bail out instead of adding to the dictionary
+ return;
+ }
[pubFields setObject:[value stringByBackslashEscapingTeXSpecials]
forKey:[key fieldName]];
}
@@ -147,7 +194,7 @@
NSString *value;
// lots of keys have empty values, so check the return value of
this method
- // some fields also seem to be continued, as "Index Terms" and
"Index Terms(2)"; not clear how to handle those yet
+ // some fields also seem to be continued, but those end up getting
dumped into Annote
if ([self copyKey:&key value:&value fromLine:line]) {
fixAndAddKeyValueToDictionary(key, value, pubFields);
[key release];
@@ -158,15 +205,15 @@
if ([pubFields count]) {
NSString *type = [pubFields objectForKey:__documentTypeString];
- // leave Document-Type as a field if we don't have a precise
mapping
+ // leave Doc-Type as a field if we don't have a precise mapping
if ([type isEqualToString:@"Journal"]) {
type = BDSKArticleString;
[pubFields removeObjectForKey:__documentTypeString];
}else if ([type isEqualToString:@"Preprint"]) {
- // preprint is most likely an article type...but unpublished
is probably better
+ // preprint is most likely an article type...but unpublished
is probably more correct
type = BDSKUnpublishedString;
}else if ([type isEqualToString:@"Report"]) {
- // this should be more accurate than "Journal", but
unfortunately all types are described with the same keys
+ // techreport should be more correct than journal, but
unfortunately all types are described with the same keys
if ([pubFields objectForKey:BDSKJournalString]) {
[pubFields setObject:[pubFields
objectForKey:BDSKJournalString] forKey:BDSKInstitutionString];
[pubFields removeObjectForKey:BDSKJournalString];
@@ -174,8 +221,8 @@
type = BDSKTechreportString;
[pubFields removeObjectForKey:__documentTypeString];
}else {
- // the only other type I've seen so far is patent, which
BibTeX doesn't have
- type = BDSKMiscString;
+ // SciFinder fields basically force everything to be an
@article
+ type = BDSKArticleString;
}
BibItem *pub = [[BibItem alloc] initWithType:type
fileType:BDSKBibtexString citeKey:nil pubFields:pubFields isNew:YES];
@@ -196,7 +243,7 @@
http://chemistry.library.wisc.edu/instruction/scifinder_taggedsample.txt
http://wiki.refbase.net/index.php/Import_Example:_SciFinder
- From those, we have the following unique doc types:
+ From those and user-supplied info, we have the following doc types:
FIELD Document Type:Journal; Online Computer File
FIELD Document Type:Patent
@@ -206,6 +253,10 @@
FIELD Document Type:Preprint
FIELD Document Type:Journal; General Review
FIELD Document Type:Report
+ FIELD Document Type:Conference; General Review
+ FIELD Document Type:Conference; Meeting Abstract; Computer Optical Disk
+ FIELD Document Type:Conference
+ FIELD Document Type:Journal; Article; (JOURNAL ARTICLE); (RESEARCH SUPPORT,
NON-U.S. GOV'T)
so it looks like we want to grab the first word/phrase before the semicolon,
or to the end of the line, whichever is shorter. Entries appear to have a
maximum of 49 lines.
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems? Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
_______________________________________________
Bibdesk-commit mailing list
bibdesk-commit@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bibdesk-commit