Revision: 28120 http://sourceforge.net/p/bibdesk/svn/28120 Author: hofman Date: 2022-12-15 17:26:56 +0000 (Thu, 15 Dec 2022) Log Message: ----------- Rewrite IACR web parser. The site pages have significantly changed. Get bibtex from pre element of article pages. For search or day results, download individual paper pages.
Modified Paths: -------------- trunk/bibdesk/BDSKIACRParser.h trunk/bibdesk/BDSKIACRParser.m Modified: trunk/bibdesk/BDSKIACRParser.h =================================================================== --- trunk/bibdesk/BDSKIACRParser.h 2022-12-15 15:14:02 UTC (rev 28119) +++ trunk/bibdesk/BDSKIACRParser.h 2022-12-15 17:26:56 UTC (rev 28120) @@ -37,8 +37,8 @@ */ #import <Cocoa/Cocoa.h> -#import "BDSKWebParser.h" +#import "BDSKAsynchronousWebParser.h" -@interface BDSKIACRParser : BDSKWebParser +@interface BDSKIACRParser : BDSKAsynchronousWebParser @end Modified: trunk/bibdesk/BDSKIACRParser.m =================================================================== --- trunk/bibdesk/BDSKIACRParser.m 2022-12-15 15:14:02 UTC (rev 28119) +++ trunk/bibdesk/BDSKIACRParser.m 2022-12-15 17:26:56 UTC (rev 28120) @@ -40,6 +40,7 @@ #import "BibItem.h" #import "DOMNode_BDSKExtensions.h" #import "NSURL_BDSKExtensions.h" +#import "NSString_BDSKExtensions.h" #import <AGRegex/AGRegex.h> @@ -50,100 +51,102 @@ if ([url hasDomain:@"eprint.iacr.org"] == NO) return NO; - if ([[[url path] lowercaseString] isEqualToString:@"/cgi-bin/search.pl"]) + if ([url hasFirstPathComponent:@"search"] || [url hasFirstPathComponent:@"days"]) return YES; - + AGRegex *absRegex = [AGRegex regexWithPattern:@"^/[0-9]{4}/[0-9]+$"]; if ([absRegex findInString:[url path]]) return YES; + DOMNode *node = [[domDocument documentElement] singleNodeForXPath:[self citationNodeXPath]]; + + if (node) + return YES; + return NO; } ++ (NSString *)citationNodeXPath { return @"./body//a[@class='paperlink']"; } + - (NSArray *)itemsReturningError:(NSError **)outError { - - NSMutableArray *items = [NSMutableArray array]; - NSURL *url = [self URL]; + DOMElement *rootElement = [[self domDocument] documentElement]; + AGRegex *yrnRegex = [AGRegex regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"]; + AGRegexMatch *yrnMatch = [yrnRegex findInString:[url path]]; + + if (yrnMatch) { + // individual article + + DOMNode *bibtexNode = [rootElement 
singleNodeForXPath:@"./body//pre[@id='bibtex']"]; + + if (bibtexNode) { + NSString *bibtexString = [bibtexNode stringValue]; + BibItem *item = [[self itemsFromBibTeXString:bibtexString error:NULL] firstObject]; + + if (item) { + NSString *baseURLString = [url absoluteString]; + [item setField:BDSKUrlString toValue:nil]; + [item addURLString:[baseURLString stringByAppendingPathExtension:@"pdf"]]; + [item addURLString:baseURLString]; + + return [NSArray arrayWithObjects:item, nil]; + } + } + + } else { + // search results or articles of previous days + + NSString *paperNodeXPath = @"./body//a[@class='paperlink']"; + NSArray *paperNodes = [rootElement nodesForXPath:[[self class] citationNodeXPath]]; + + for (DOMNode *paperNode in paperNodes) { + NSString *path = [paperNode stringValueOfAttribute:@"href"]; + yrnMatch = [yrnRegex findInString:path]; + NSString *year = [yrnMatch groupAtIndex:1]; + NSString *reportNum = [yrnMatch groupAtIndex:2]; + + NSURL *bibtexURL = [[NSURL URLWithString:path relativeToURL:url] absoluteURL]; + NSURLRequest *request = [NSURLRequest requestWithURL:bibtexURL]; + NSDictionary *contextInfo = [NSDictionary dictionaryWithObjectsAndKeys:[bibtexURL absoluteString], @"baseURLString", nil]; + + [self addDownloadWithRequest:request contextInfo:contextInfo]; + } + + } + + return nil; +} - // is this a search results page or an individual article? 
- BOOL isSearch = [[[url path] lowercaseString] isEqualToString:@"/cgi-bin/search.pl"]; +- (NSArray *)itemsFromDownload:(BDSKCitationDownload *)download error:(NSError **)outError { + NSXMLDocument *xmlDoc = [[[NSXMLDocument alloc] initWithData:[download data] options:NSXMLDocumentTidyHTML error:outError] autorelease]; - // construct the source item(s) to parse - NSArray *sources = nil; - DOMElement *rootElement = [[self domDocument] documentElement]; - if (isSearch) - sources = [rootElement nodesForXPath:@"./body//dt"]; - else - sources = [NSArray arrayWithObjects:rootElement, nil]; - - if ([sources count] == 0) + if (xmlDoc == nil) return nil; - DOMXPathExpression *titleNodePath = nil; - DOMXPathExpression *authorNodePath = nil; - DOMXPathExpression *pathToSearchNodePath = nil; + NSXMLNode *bibtexNode = [[[xmlDoc rootElement] nodesForXPath:@"./body//pre[@id='bibtex']" error:NULL] firstObject]; - if (isSearch) { - titleNodePath = [[self domDocument] createExpression:@"following-sibling::dd/b" resolver:nil]; - authorNodePath = [[self domDocument] createExpression:@"following-sibling::dd[position()=2]/em" resolver:nil]; - pathToSearchNodePath = [[self domDocument] createExpression:@".//a/@href" resolver:nil]; - } else { - titleNodePath = [[self domDocument] createExpression:@".//b" resolver:nil]; - authorNodePath = [[self domDocument] createExpression:@".//i" resolver:nil]; - } - - for (DOMNode *sourceNode in sources) { - - NSMutableDictionary *pubFields = [NSMutableDictionary dictionary]; - NSArray *urlsArray = nil; - NSString *pathToSearch = nil; - DOMNode *node; - NSString *string; + if (bibtexNode) { + NSString *baseURLString = [[download contextInfo] objectForKey:@"baseURLString"]; + AGRegex *yrnRegex = [AGRegex regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"]; + AGRegexMatch *yrnMatch = [yrnRegex findInString:[[NSURL URLWithString:baseURLString] path]]; + NSString *year = [yrnMatch groupAtIndex:1]; + NSString *reportNum = [yrnMatch groupAtIndex:2]; - // set title - 
node = [sourceNode singleNodeForXPathExpression:titleNodePath]; - if ((string = [node stringValue])) - [pubFields setObject:string forKey:BDSKTitleString]; - // set authors - node = [sourceNode singleNodeForXPathExpression:authorNodePath]; - if ((string = [node stringValue])) - [pubFields setObject:string forKey:BDSKAuthorString]; - // to get year and report number - if (pathToSearchNodePath) { - node = [sourceNode singleNodeForXPathExpression:pathToSearchNodePath]; - if ((string = [node stringValue])) - pathToSearch = string; - } else { - pathToSearch = [url path]; + NSString *bibtexString = [[bibtexNode stringValue] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]; + BibItem *item = [[self itemsFromBibTeXString:bibtexString error:NULL] firstObject]; + + if (item) { + [item setField:BDSKUrlString toValue:nil]; + [item addURLString:[baseURLString stringByAppendingPathExtension:@"pdf"]]; + [item addURLString:baseURLString]; } - // compute year and report number - AGRegex *yrnRegex = [AGRegex regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"]; - AGRegexMatch *yrnMatch = [yrnRegex findInString:pathToSearch]; - NSString *year = [yrnMatch groupAtIndex:1]; - NSString *reportNum = [yrnMatch groupAtIndex:2]; - NSString *urlBaseString = [NSString stringWithFormat:@"%@://%@/%@/%@", [url scheme], [url host], year, reportNum]; - - // set year, report number, PDF url, eprint - if ((year != nil) && (reportNum != nil)) { - [pubFields setValue:year forKey:BDSKYearString]; - [pubFields setValue:[NSString stringWithFormat:@"Cryptology ePrint Archive, Report %@/%@", year, reportNum] forKey:@"Note"]; - urlsArray = [NSArray arrayWithObjects: - [urlBaseString stringByAppendingPathExtension:@"pdf"], urlBaseString, nil]; - [pubFields setValue:[NSString stringWithFormat:@"\\url{%@}", urlBaseString] forKey:@"Eprint"]; - } - - // add item - BibItem *item = [[BibItem alloc] initWithType:BDSKMiscString citeKey:nil pubFields:pubFields URLStrings:urlsArray]; - [items 
addObject:item]; - [item release]; - - } - - return items; + return [NSArray arrayWithObjects:item, nil]; + } + + return nil; } + (NSString *)address { return @"https://eprint.iacr.org/"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. _______________________________________________ Bibdesk-commit mailing list Bibdesk-commit@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/bibdesk-commit