Revision: 28120
          http://sourceforge.net/p/bibdesk/svn/28120
Author:   hofman
Date:     2022-12-15 17:26:56 +0000 (Thu, 15 Dec 2022)
Log Message:
-----------
Rewrite IACR web parser. The site pages have significantly changed. Get bibtex 
from the pre element of article pages. For search or day results, download individual 
paper pages.

Modified Paths:
--------------
    trunk/bibdesk/BDSKIACRParser.h
    trunk/bibdesk/BDSKIACRParser.m

Modified: trunk/bibdesk/BDSKIACRParser.h
===================================================================
--- trunk/bibdesk/BDSKIACRParser.h      2022-12-15 15:14:02 UTC (rev 28119)
+++ trunk/bibdesk/BDSKIACRParser.h      2022-12-15 17:26:56 UTC (rev 28120)
@@ -37,8 +37,8 @@
  */
 
 #import <Cocoa/Cocoa.h>
-#import "BDSKWebParser.h"
+#import "BDSKAsynchronousWebParser.h"
 
 
-@interface BDSKIACRParser : BDSKWebParser
+@interface BDSKIACRParser : BDSKAsynchronousWebParser
 @end

Modified: trunk/bibdesk/BDSKIACRParser.m
===================================================================
--- trunk/bibdesk/BDSKIACRParser.m      2022-12-15 15:14:02 UTC (rev 28119)
+++ trunk/bibdesk/BDSKIACRParser.m      2022-12-15 17:26:56 UTC (rev 28120)
@@ -40,6 +40,7 @@
 #import "BibItem.h"
 #import "DOMNode_BDSKExtensions.h"
 #import "NSURL_BDSKExtensions.h"
+#import "NSString_BDSKExtensions.h"
 #import <AGRegex/AGRegex.h>
 
 
@@ -50,100 +51,102 @@
     if ([url hasDomain:@"eprint.iacr.org"] == NO)
         return NO;
     
-    if ([[[url path] lowercaseString] isEqualToString:@"/cgi-bin/search.pl"])
+    if ([url hasFirstPathComponent:@"search"] || [url 
hasFirstPathComponent:@"days"])
         return YES;
-    
+
     AGRegex *absRegex = [AGRegex regexWithPattern:@"^/[0-9]{4}/[0-9]+$"];
        
     if ([absRegex findInString:[url path]])
         return YES;
     
+    DOMNode *node = [[domDocument documentElement] singleNodeForXPath:[self 
citationNodeXPath]];
+    
+    if (node)
+        return YES;
+    
        return NO;
        
 }
 
++ (NSString *)citationNodeXPath { return @"./body//a[@class='paperlink']"; }
+
 - (NSArray *)itemsReturningError:(NSError **)outError {
-
-    NSMutableArray *items = [NSMutableArray array];
-       
     NSURL *url = [self URL];
+    DOMElement *rootElement = [[self domDocument] documentElement];
+    AGRegex *yrnRegex = [AGRegex regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
+    AGRegexMatch *yrnMatch = [yrnRegex findInString:[url path]];
+    
+    if (yrnMatch) {
+        // individual article
+        
+        DOMNode *bibtexNode = [rootElement 
singleNodeForXPath:@"./body//pre[@id='bibtex']"];
+        
+        if (bibtexNode) {
+            NSString *bibtexString = [bibtexNode stringValue];
+            BibItem *item = [[self itemsFromBibTeXString:bibtexString 
error:NULL] firstObject];
+            
+            if (item) {
+                NSString *baseURLString = [url absoluteString];
+                [item setField:BDSKUrlString toValue:nil];
+                [item addURLString:[baseURLString 
stringByAppendingPathExtension:@"pdf"]];
+                [item addURLString:baseURLString];
+                
+                return [NSArray arrayWithObjects:item, nil];
+            }
+        }
+        
+    } else {
+        // search results or articles of previous days
+        
+        NSString *paperNodeXPath = @"./body//a[@class='paperlink']";
+        NSArray *paperNodes = [rootElement nodesForXPath:[[self class] 
citationNodeXPath]];
+        
+        for (DOMNode *paperNode in paperNodes) {
+            NSString *path = [paperNode stringValueOfAttribute:@"href"];
+            yrnMatch = [yrnRegex findInString:path];
+            NSString *year = [yrnMatch groupAtIndex:1];
+            NSString *reportNum = [yrnMatch groupAtIndex:2];
+            
+            NSURL *bibtexURL = [[NSURL URLWithString:path relativeToURL:url] 
absoluteURL];
+            NSURLRequest *request = [NSURLRequest requestWithURL:bibtexURL];
+            NSDictionary *contextInfo = [NSDictionary 
dictionaryWithObjectsAndKeys:[bibtexURL absoluteString], @"baseURLString", nil];
+            
+            [self addDownloadWithRequest:request contextInfo:contextInfo];
+        }
+        
+    }
+    
+    return nil;
+}
 
-       // is this a search results page or an individual article?
-       BOOL isSearch = [[[url path] lowercaseString] 
isEqualToString:@"/cgi-bin/search.pl"];
+- (NSArray *)itemsFromDownload:(BDSKCitationDownload *)download error:(NSError 
**)outError {
+    NSXMLDocument *xmlDoc = [[[NSXMLDocument alloc] initWithData:[download 
data] options:NSXMLDocumentTidyHTML error:outError] autorelease];
     
-       // construct the source item(s) to parse
-       NSArray *sources = nil;
-    DOMElement *rootElement = [[self domDocument] documentElement];
-    if (isSearch)
-        sources = [rootElement nodesForXPath:@"./body//dt"];
-    else
-        sources = [NSArray arrayWithObjects:rootElement, nil];
-       
-    if ([sources count] == 0)
+    if (xmlDoc == nil)
         return nil;
     
-    DOMXPathExpression *titleNodePath = nil;
-    DOMXPathExpression *authorNodePath = nil;
-    DOMXPathExpression *pathToSearchNodePath = nil;
+    NSXMLNode *bibtexNode = [[[xmlDoc rootElement] 
nodesForXPath:@"./body//pre[@id='bibtex']" error:NULL] firstObject];
     
-    if (isSearch) {
-        titleNodePath = [[self domDocument] 
createExpression:@"following-sibling::dd/b" resolver:nil];
-        authorNodePath = [[self domDocument] 
createExpression:@"following-sibling::dd[position()=2]/em" resolver:nil];
-        pathToSearchNodePath = [[self domDocument] 
createExpression:@".//a/@href" resolver:nil];
-    } else {
-        titleNodePath = [[self domDocument] createExpression:@".//b" 
resolver:nil];
-        authorNodePath = [[self domDocument] createExpression:@".//i" 
resolver:nil];
-    }
-    
-    for (DOMNode *sourceNode in sources) {
-               
-               NSMutableDictionary *pubFields = [NSMutableDictionary 
dictionary];
-               NSArray *urlsArray = nil;
-        NSString *pathToSearch = nil;
-        DOMNode *node;
-        NSString *string;
+    if (bibtexNode) {
+        NSString *baseURLString = [[download contextInfo] 
objectForKey:@"baseURLString"];
+        AGRegex *yrnRegex = [AGRegex 
regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
+        AGRegexMatch *yrnMatch = [yrnRegex findInString:[[NSURL 
URLWithString:baseURLString] path]];
+        NSString *year = [yrnMatch groupAtIndex:1];
+        NSString *reportNum = [yrnMatch groupAtIndex:2];
         
-        // set title
-        node = [sourceNode singleNodeForXPathExpression:titleNodePath];
-        if ((string = [node stringValue]))
-            [pubFields setObject:string forKey:BDSKTitleString];
-        // set authors
-        node = [sourceNode singleNodeForXPathExpression:authorNodePath];
-        if ((string = [node stringValue]))
-            [pubFields setObject:string forKey:BDSKAuthorString];
-        // to get year and report number
-        if (pathToSearchNodePath) {
-            node = [sourceNode 
singleNodeForXPathExpression:pathToSearchNodePath];
-            if ((string = [node stringValue]))
-                pathToSearch = string;
-        } else {
-            pathToSearch = [url path];
+        NSString *bibtexString = [[bibtexNode stringValue] 
stringByTrimmingCharactersInSet:[NSCharacterSet 
whitespaceAndNewlineCharacterSet]];
+        BibItem *item = [[self itemsFromBibTeXString:bibtexString error:NULL] 
firstObject];
+        
+        if (item) {
+            [item setField:BDSKUrlString toValue:nil];
+            [item addURLString:[baseURLString 
stringByAppendingPathExtension:@"pdf"]];
+            [item addURLString:baseURLString];
         }
         
-        // compute year and report number
-        AGRegex *yrnRegex = [AGRegex 
regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
-               AGRegexMatch *yrnMatch = [yrnRegex findInString:pathToSearch];
-               NSString *year = [yrnMatch groupAtIndex:1];
-               NSString *reportNum = [yrnMatch groupAtIndex:2];
-        NSString *urlBaseString = [NSString stringWithFormat:@"%@://%@/%@/%@", 
[url scheme], [url host], year, reportNum];
-
-               // set year, report number, PDF url, eprint
-               if ((year != nil) && (reportNum != nil)) {
-                       [pubFields setValue:year forKey:BDSKYearString];
-                       [pubFields setValue:[NSString 
stringWithFormat:@"Cryptology ePrint Archive, Report %@/%@", year, reportNum] 
forKey:@"Note"];
-            urlsArray = [NSArray arrayWithObjects:
-                    [urlBaseString stringByAppendingPathExtension:@"pdf"], 
urlBaseString, nil];
-                       [pubFields setValue:[NSString 
stringWithFormat:@"\\url{%@}", urlBaseString] forKey:@"Eprint"];
-               }
-
-               // add item
-               BibItem *item = [[BibItem alloc] initWithType:BDSKMiscString 
citeKey:nil pubFields:pubFields URLStrings:urlsArray];
-               [items addObject:item];
-               [item release];
-                       
-       }
-       
-    return items;
+        return [NSArray arrayWithObjects:item, nil];
+    }
+    
+    return nil;
 }
 
 + (NSString *)address { return @"https://eprint.iacr.org/"; }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.



_______________________________________________
Bibdesk-commit mailing list
Bibdesk-commit@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bibdesk-commit

Reply via email to