[ 
https://jira.duraspace.org/browse/DS-1030?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=22477#comment-22477
 ] 

Peter Dietz edited comment on DS-1030 at 9/16/11 8:23 PM:
----------------------------------------------------------

My local version has been extensively modified, so this mockup isn't a 
clean diff against trunk, but it should be clear what needs to be 
changed. When I have more time, I'll attach a full, clean patch.

{code}
    public static void markRobotsByIP()
    {
        try {
            for(String ip : SpiderDetector.getSpiderIpAddresses()){
                /* Result Process to alter record to be identified as a bot */
                // Changed to using Impl block below
                ResultProcessor processor = new ResultProcessorDeleteAddImpl();

                /* query for ip, exclude results previously set as bots. */
                if (ip.matches("[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"))  {
                    // Full 4 octet string, run as-is.
                    processor.execute("ip:" + ip + " AND -isBot:true");
                } else if (ip.matches(".*\\.$"))  {
                    // didn't match full-octet, but ends in period, we assume 
it was something like #.#.#. or #.#.
                    processor.execute("ip:" + ip + "* AND -isBot:true");
                } else if (ip.matches(".*[0-9]$"))  {
                    // ends with a number, and is not a full 4-octet as first 
entry, so we append .*
                    processor.execute("ip:" + ip + ".* AND -isBot:true");
                } else {
                    log.error("Unexpected IP value: " + ip);
                }
            }
            solr.commit();
        } catch (Exception e) {
                log.error(e.getMessage(),e);
        }
    }



    private static class ResultProcessorDeleteAddImpl extends ResultProcessor {

        public ResultProcessorDeleteAddImpl() {
        }

        public void process(SolrDocument doc) throws IOException, 
SolrServerException {
            doc.removeFields("isBot");
            doc.addField("isBot", true);
            SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
            Integer type = (Integer) doc.getFieldValue("type");
            Integer id = (Integer) doc.getFieldValue("id");
            String ip = (String) doc.getFieldValue("ip");

            String time = 
DateFormatUtils.formatUTC((Date)doc.getFieldValue("time"), 
SolrLogger.DATE_FORMAT_8601);

            //Uniquely remove previous entry. Should be safe to assume only one 
request to a specified resource by a single user per millisecond.
            solr.deleteByQuery("type:" + type + " AND id:" + id + " AND ip:" + 
ip + " AND time:[" + time + " TO " + time +"]");

            solr.add(newInput);
            log.info("Marked " + doc.getFieldValue("ip") + " as bot");
        }
    }



{code}

EDITED: Tried to make the code-paste format readable.
                
      was (Author: peterdietz):
    My local version has been extensively modified, and so this mockup isn't a 
clean diff-against-trunk, but it should be clear to see what needs to be 
changed. When I have more time, I'll attach a full/clean patch.

{noformat}
    public static void markRobotsByIP()
    {
        try {
            for(String ip : SpiderDetector.getSpiderIpAddresses()){
                /* Result Process to alter record to be identified as a bot */
                // Changed to using Impl block below
                ResultProcessor processor = new ResultProcessorDeleteAddImpl();

                /* query for ip, exclude results previously set as bots. */
                if (ip.matches("[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"))  {
                    // Full 4 octet string, run as-is.
                    processor.execute("ip:" + ip + " AND -isBot:true");
                } else if (ip.matches(".*\\.$"))  {
                    // didn't match full-octet, but ends in period, we assume 
it was something like #.#.#. or #.#.
                    processor.execute("ip:" + ip + "* AND -isBot:true");
                } else if (ip.matches(".*[0-9]$"))  {
                    // ends with a number, and is not a full 4-octet as first 
entry, so we append .*
                    processor.execute("ip:" + ip + ".* AND -isBot:true");
                } else {
                    log.error("Unexpected IP value: " + ip);
                }
            }
            solr.commit();
        } catch (Exception e) {
                log.error(e.getMessage(),e);
        }
    }



    private static class ResultProcessorDeleteAddImpl extends ResultProcessor {

        public ResultProcessorDeleteAddImpl() {
        }

        public void process(SolrDocument doc) throws IOException, 
SolrServerException {
            doc.removeFields("isBot");
            doc.addField("isBot", true);
            SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
            Integer type = (Integer) doc.getFieldValue("type");
            Integer id = (Integer) doc.getFieldValue("id");
            String ip = (String) doc.getFieldValue("ip");

            String time = 
DateFormatUtils.formatUTC((Date)doc.getFieldValue("time"), 
SolrLogger.DATE_FORMAT_8601);

            //Uniquely remove previous entry. Should be safe to assume only one 
request to a specified resource by a single user per millisecond.
            solr.deleteByQuery("type:" + type + " AND id:" + id + " AND ip:" + 
ip + " AND time:[" + time + " TO " + time +"]");

            solr.add(newInput);
            log.info("Marked " + doc.getFieldValue("ip") + " as bot");
        }
    }



{noformat}


                  
> markRobotsByIP doesn't remove isBot:false records
> -------------------------------------------------
>
>                 Key: DS-1030
>                 URL: https://jira.duraspace.org/browse/DS-1030
>             Project: DSpace
>          Issue Type: Bug
>          Components: Solr
>    Affects Versions: 1.7.2
>            Reporter: Dan Ishimitsu
>            Assignee: Peter Dietz
>
> The expectation based on docs is that /dspace/bin/dspace stats-util -m would 
> update isBot:false records to isBot:true (based on IPs in spider configs).
> It appears instead to create duplicate records with isBot:true. So we end 
> up with all of the original isBot:false records, plus an equal number of new 
> isBot:true records. I think it's just missing a delete query at the end to 
> clear the old records matching IPs with isBot:false.

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: 
https://jira.duraspace.org/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira

        

------------------------------------------------------------------------------
BlackBerry® DevCon Americas, Oct. 18-20, San Francisco, CA
http://p.sf.net/sfu/rim-devcon-copy2
_______________________________________________
Dspace-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dspace-devel

Reply via email to