Author: dongy
Date: 2006-06-03 16:53:25 +0000 (Sat, 03 Jun 2006)
New Revision: 9030

Modified:
   trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
   trunk/apps/DarknetSpiderBot/class/bot.class.php
   trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql
Log:
DarknetSpiderBot: Fixed some bugs with regular expressions; added new functions 
to insert parsed URLs into the database.

Modified: trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php    2006-06-03 16:07:15 UTC 
(rev 9029)
+++ trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php    2006-06-03 16:53:25 UTC 
(rev 9030)
@@ -9,18 +9,55 @@
 $buffer_file = 'local.html';
 $bot = new bot($fcp_host, $fcp_port, $buffer_file);

+
 $splitedURL['key_type'] = 'SSK';
 $splitedURL['key_value'] = 
'PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE';
 $splitedURL['site_name'] = 'Index';
-$splitedURL['edition'] = '21';
+$splitedURL['edition'] = '34';
+$splitedURL['path'] = 'all.html';


+
 $path = $bot->constructURL($splitedURL);
 $bot->getDistantFile($path);

-echo $bot->extractTitle();
+$title = $bot->extractTitle();
+$metas = $bot->extractMetas();
+$urls = $bot->extractURLs();
+$bot->cleanURLs($urls, $splitedURL['key_type'], $splitedURL['key_value'], 
$splitedURL['site_name'], $splitedURL['edition']);

-$bot->dbAddFreesite($splitedURL);
+foreach ( $urls as $value )
+{
+       
+       if ( !empty($value) )
+       {
+               $splitedURL = $bot->splitURL($value);
+               echo $value;
+               print_r($splitedURL);
+               $id_freesite = $bot->dbGetFreesiteId($splitedURL);
+               if ( $id_freesite === false )
+                       $id_freesite = $bot->dbAddFreesite($splitedURL);

+               $bot->dbAddFreesiteURL($id_freesite, $splitedURL['path']);
+               
+               
+               
+               
+       }
+       
+}
+
+print_r($urls);
+
+$insert_id = $bot->dbAddFreesite($splitedURL);
+$bot->dbAddFreesiteInformations($insert_id, $title, $metas);
+
+
+
+$url = '/USK at 
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/ind-ex/test';
+$splitedURL = $bot->splitURL($url);
+print_r($splitedURL);
+
+
 echo "\r\nDarknetSpiderBot is closing...\r\n";
 ?>
\ No newline at end of file

Modified: trunk/apps/DarknetSpiderBot/class/bot.class.php
===================================================================
--- trunk/apps/DarknetSpiderBot/class/bot.class.php     2006-06-03 16:07:15 UTC 
(rev 9029)
+++ trunk/apps/DarknetSpiderBot/class/bot.class.php     2006-06-03 16:53:25 UTC 
(rev 9030)
@@ -8,6 +8,8 @@
        var $buffer;
        var $buffer_file;

+       var $current_urls;
+       
        // Constructor
        function bot ($fcp_host, $fcp_port, $buffer_file)
        {
@@ -78,18 +80,35 @@
                $splitedURL['key_type'] = substr($url, 1, 3);

                $second_slashe_pos = strpos($url, '/', 5);
-               $splitedURL['key_value'] = substr($url, 5, 
$second_slashe_pos-5);

+               
                if ( $splitedURL['key_type'] == 'CHK' )
                {
-                       $splitedURL['path'] = substr($url, 
$second_slashe_pos+1);
+                       if ($second_slashe_pos != 0)
+                       {
+                               $splitedURL['key_value'] = substr($url, 5, 
$second_slashe_pos-5);
+                               $splitedURL['path'] = substr($url, 
$second_slashe_pos+1);
+                       }
+                       else
+                       {
+                               $splitedURL['key_value'] = substr($url, 5);
+                       }
                }
                else
                {
-                       preg_match('#^(.+)[/-]+([0-9]+)(.*)$#', substr($url, 
$second_slashe_pos+1), $matches );
-                       $splitedURL['site_name'] = $matches[1];
-                       $splitedURL['edition'] = $matches[2];
-                       $splitedURL['path'] = $matches[3];
+                       //$path = substr($url, $second_slashe_pos+1);
+                       $splitedURL['key_value'] = substr($url, 5, 
$second_slashe_pos-5);
+                       
+                       if ( preg_match('#^([^/]+)[/-]+([0-9]+)/*(.*)$#', 
substr($url, $second_slashe_pos+1), $matches ) )
+                       {
+                               $splitedURL['site_name'] = $matches[1];
+                               $splitedURL['edition'] = $matches[2];
+                               $splitedURL['path'] = $matches[3];
+                       }
+                       else
+                       {
+                               $splitedURL['site_name'] = substr($url, 
$second_slashe_pos+1);
+                       }
                }

                if ( substr($splitedURL['path'], 0, 1) == '/' )
@@ -124,7 +143,7 @@



-       function cleanURLs (&$urls, $sitekey, $sitename)
+       function cleanURLs (&$urls, $key_type, $key_value, $site_name, $edition)
        {
                // todo: support des ../

@@ -133,7 +152,7 @@

                        $value = trim($value);

-                       if ( substr($value, 0, 7) == 'http://') // si l'url 
commence par http://, on la retire
+                       if ( substr($value, 0, 7) == 'http://' || 
substr($value, 0, 13) == '/?newbookmark' ) // si l'url commence par http://, on 
la retire
                        {
                                $value = '';
                        }
@@ -143,14 +162,14 @@
                                        $value = substr($value, 2);

                                // on ajoute $sitepath
-                               $value = '/'.$sitekey.'/'.$sitename.'/'.$value;
+                               $value = 
'/'.$key_type.'@'.$key_value.'/'.$site_name.'-'.$edition.'/'.$value;
                        }

                        if ( substr($value, -1) == '/') // si l'url fini par un 
slash, on le retire
                                $value = substr($value, 0, -1);

                        // On retire les liens vers les diverses versions
-                       if ( 
preg_match("#^/[A-Z]{3,3}@$sitekey/$sitename/?-[0-9]+$#i", $value, $matches, 
PREG_OFFSET_CAPTURE) )
+                       if ( 
preg_match("#^/[A-Z]{3,3}@$key_value/$site_name/?-[0-9]+$#i", $value, $matches, 
PREG_OFFSET_CAPTURE) )
                                $value = '';

                        // mise à jour de l'url
@@ -181,21 +200,21 @@
                                }

                                if ( !empty($buf['name']) && 
!empty($buf['content']) )
-                                       $meta[$buf['name']] = $buf['content'];
+                                       $metas[$buf['name']] = $buf['content'];

                                unset($buf);

                        }
                }

-               return $meta;
+               return $metas;

        }

        function extractURLs ()
        {

-           if ( preg_match_all('/<a href="(.*?)".*>/i', 
$this->buffer_contents, $matches) )  
+           if ( preg_match_all('/<a href="(.*?)".*>/i', $this->buffer, 
$matches) )  
                return $matches[1];

        }
@@ -208,7 +227,43 @@
                        $splitedURL['key_type'] = 'SSK';

                mysql_query("INSERT INTO freesites_keys ( key_type, key_value, 
site_name, edition, created, last_update ) VALUES ('$splitedURL[key_type]', 
'$splitedURL[key_value]', '$splitedURL[site_name]', '$splitedURL[edition]', 
NOW(), NOW() ) ");
+               
+               return mysql_insert_id();
        }
+       
+       function dbGetFreesiteId ($splitedURL)
+       {
+               
+               $result = mysql_query("SELECT id FROM freesites_keys WHERE 
key_value = '$splitedURL[key_value]' ");
+               if ( mysql_num_rows($result) > 0 )
+               {
+                       list($id_freesite) = mysql_fetch_row($result);
+                       return $id_freesite;
+               }
+               else
+                       return false;
+       }
+       
+       function dbAddFreesiteInformations ($id_freesite, $title, $metas)
+       {
+               
+               mysql_query("INSERT INTO freesites_informations ( id_freesite, 
title, meta_description, meta_keywords ) VALUES ( '$id_freesite', '$title', 
'$metas[description]', '$metas[keywords]' ) ");
+       }
+       
+       function dbAddFreesiteURL ($id_freesite, $url)
+       {
+               
+               mysql_query("INSERT INTO freesites_urls ( id_freesite, path ) 
VALUES ( '$id_freesite', '$url' ) ");
+       }
+       
+       function requestingFreesite ($splitedURL)
+       {
+               
+               $path = $this->constructURL($splitedURL);
+               $this->getDistantFile($path);
+               
+               
+       }

 }


Modified: trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql
===================================================================
--- trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql      2006-06-03 
16:07:15 UTC (rev 9029)
+++ trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql      2006-06-03 
16:53:25 UTC (rev 9030)
@@ -1,72 +1,58 @@
--- phpMyAdmin SQL Dump
--- version 2.6.1-rc2
--- http://www.phpmyadmin.net
--- 
--- Serveur: localhost
--- Généré le : Jeudi 13 Avril 2006 à 22:15
--- Version du serveur: 4.0.20
--- Version de PHP: 4.3.4
--- 
--- Base de données: `darknetspiderbot`
--- 
-
--- --------------------------------------------------------
-
--- 
--- Structure de la table `freesites_informations`
--- 
-
-CREATE TABLE `freesites_informations` (
-  `id_freesites` smallint(5) unsigned NOT NULL default '0',
-  `title` varchar(255) NOT NULL default '',
-  `meta_description` varchar(255) NOT NULL default '',
-  `meta_keywords` text NOT NULL,
-  UNIQUE KEY `id_freesites` (`id_freesites`)
-) TYPE=MyISAM;
-
--- 
--- Contenu de la table `freesites_informations`
--- 
-
-
--- --------------------------------------------------------
-
--- 
--- Structure de la table `freesites_keys`
--- 
-
-CREATE TABLE `freesites_keys` (
-  `id` smallint(5) unsigned NOT NULL auto_increment,
-  `key_type` enum('CHK','SSK') NOT NULL default 'CHK',
-  `key_value` varchar(255) NOT NULL default '',
-  `site_name` varchar(255) NOT NULL default '',
-  `edition` smallint(5) unsigned NOT NULL default '0',
-  `created` timestamp(14) NOT NULL,
-  `last_update` timestamp(14) NOT NULL default '00000000000000',
-  PRIMARY KEY  (`id`),
-  UNIQUE KEY `key_value` (`key_value`),
-  KEY `last_update` (`last_update`)
-) TYPE=MyISAM AUTO_INCREMENT=1 ;
-
--- 
--- Contenu de la table `freesites_keys`
--- 
-
-
--- --------------------------------------------------------
-
--- 
--- Structure de la table `freesites_urls`
--- 
-
-CREATE TABLE `freesites_urls` (
-  `id_freesites` smallint(6) NOT NULL default '0',
-  `url` varchar(255) NOT NULL default '',
-  `status` enum('standby','retrieving','retrieved','error') NOT NULL default 
'standby',
-  KEY `id_freesites` (`id_freesites`)
-) TYPE=MyISAM;
-
--- 
--- Contenu de la table `freesites_urls`
--- 
-
+-- phpMyAdmin SQL Dump
+-- version 2.6.1
+-- http://www.phpmyadmin.net
+-- 
+-- Serveur: localhost
+-- Généré le : Samedi 03 Juin 2006 à 18:45
+-- Version du serveur: 4.1.9
+-- Version de PHP: 4.3.10
+-- 
+-- Base de données: `darknetspiderbot`
+-- 
+
+-- --------------------------------------------------------
+
+-- 
+-- Structure de la table `freesites_informations`
+-- 
+
+CREATE TABLE `freesites_informations` (
+  `id_freesite` smallint(5) unsigned NOT NULL default '0',
+  `title` varchar(255) NOT NULL default '',
+  `meta_description` varchar(255) NOT NULL default '',
+  `meta_keywords` text NOT NULL,
+  UNIQUE KEY `id_freesite` (`id_freesite`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+
+-- --------------------------------------------------------
+
+-- 
+-- Structure de la table `freesites_keys`
+-- 
+
+CREATE TABLE `freesites_keys` (
+  `id` smallint(5) unsigned NOT NULL auto_increment,
+  `key_type` enum('CHK','SSK') NOT NULL default 'CHK',
+  `key_value` varchar(255) NOT NULL default '',
+  `site_name` varchar(255) NOT NULL default '',
+  `edition` smallint(5) unsigned NOT NULL default '0',
+  `created` timestamp NOT NULL default CURRENT_TIMESTAMP on update 
CURRENT_TIMESTAMP,
+  `last_update` timestamp NOT NULL default '0000-00-00 00:00:00',
+  PRIMARY KEY  (`id`),
+  UNIQUE KEY `key_value` (`key_value`),
+  KEY `last_update` (`last_update`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ;
+
+-- --------------------------------------------------------
+
+-- 
+-- Structure de la table `freesites_urls`
+-- 
+
+CREATE TABLE `freesites_urls` (
+  `id_freesite` smallint(5) unsigned NOT NULL default '0',
+  `path` varchar(255) NOT NULL default '',
+  `status` enum('standby','retrieving','retrieved','error') NOT NULL default 
'standby',
+  KEY `id_freesite` (`id_freesite`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+        
\ No newline at end of file


Reply via email to