Author: dongy
Date: 2006-06-03 16:53:25 +0000 (Sat, 03 Jun 2006)
New Revision: 9030
Modified:
trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
trunk/apps/DarknetSpiderBot/class/bot.class.php
trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql
Log:
DarknetSpiderBot: Fixed some bugs with regular expressions; added new functions
to insert parsed URLs into the database.
Modified: trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php 2006-06-03 16:07:15 UTC
(rev 9029)
+++ trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php 2006-06-03 16:53:25 UTC
(rev 9030)
@@ -9,18 +9,55 @@
$buffer_file = 'local.html';
$bot = new bot($fcp_host, $fcp_port, $buffer_file);
+
$splitedURL['key_type'] = 'SSK';
$splitedURL['key_value'] =
'PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE';
$splitedURL['site_name'] = 'Index';
-$splitedURL['edition'] = '21';
+$splitedURL['edition'] = '34';
+$splitedURL['path'] = 'all.html';
+
$path = $bot->constructURL($splitedURL);
$bot->getDistantFile($path);
-echo $bot->extractTitle();
+$title = $bot->extractTitle();
+$metas = $bot->extractMetas();
+$urls = $bot->extractURLs();
+$bot->cleanURLs($urls, $splitedURL['key_type'], $splitedURL['key_value'],
$splitedURL['site_name'], $splitedURL['edition']);
-$bot->dbAddFreesite($splitedURL);
+foreach ( $urls as $value )
+{
+
+ if ( !empty($value) )
+ {
+ $splitedURL = $bot->splitURL($value);
+ echo $value;
+ print_r($splitedURL);
+ $id_freesite = $bot->dbGetFreesiteId($splitedURL);
+ if ( $id_freesite === false )
+ $id_freesite = $bot->dbAddFreesite($splitedURL);
+ $bot->dbAddFreesiteURL($id_freesite, $splitedURL['path']);
+
+
+
+
+ }
+
+}
+
+print_r($urls);
+
+$insert_id = $bot->dbAddFreesite($splitedURL);
+$bot->dbAddFreesiteInformations($insert_id, $title, $metas);
+
+
+
+$url = '/USK at
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/ind-ex/test';
+$splitedURL = $bot->splitURL($url);
+print_r($splitedURL);
+
+
echo "\r\nDarknetSpiderBot is closing...\r\n";
?>
\ No newline at end of file
Modified: trunk/apps/DarknetSpiderBot/class/bot.class.php
===================================================================
--- trunk/apps/DarknetSpiderBot/class/bot.class.php 2006-06-03 16:07:15 UTC
(rev 9029)
+++ trunk/apps/DarknetSpiderBot/class/bot.class.php 2006-06-03 16:53:25 UTC
(rev 9030)
@@ -8,6 +8,8 @@
var $buffer;
var $buffer_file;
+ var $current_urls;
+
// Constructor
function bot ($fcp_host, $fcp_port, $buffer_file)
{
@@ -78,18 +80,35 @@
$splitedURL['key_type'] = substr($url, 1, 3);
$second_slashe_pos = strpos($url, '/', 5);
- $splitedURL['key_value'] = substr($url, 5,
$second_slashe_pos-5);
+
if ( $splitedURL['key_type'] == 'CHK' )
{
- $splitedURL['path'] = substr($url,
$second_slashe_pos+1);
+ if ($second_slashe_pos != 0)
+ {
+ $splitedURL['key_value'] = substr($url, 5,
$second_slashe_pos-5);
+ $splitedURL['path'] = substr($url,
$second_slashe_pos+1);
+ }
+ else
+ {
+ $splitedURL['key_value'] = substr($url, 5);
+ }
}
else
{
- preg_match('#^(.+)[/-]+([0-9]+)(.*)$#', substr($url,
$second_slashe_pos+1), $matches );
- $splitedURL['site_name'] = $matches[1];
- $splitedURL['edition'] = $matches[2];
- $splitedURL['path'] = $matches[3];
+ //$path = substr($url, $second_slashe_pos+1);
+ $splitedURL['key_value'] = substr($url, 5,
$second_slashe_pos-5);
+
+ if ( preg_match('#^([^/]+)[/-]+([0-9]+)/*(.*)$#',
substr($url, $second_slashe_pos+1), $matches ) )
+ {
+ $splitedURL['site_name'] = $matches[1];
+ $splitedURL['edition'] = $matches[2];
+ $splitedURL['path'] = $matches[3];
+ }
+ else
+ {
+ $splitedURL['site_name'] = substr($url,
$second_slashe_pos+1);
+ }
}
if ( substr($splitedURL['path'], 0, 1) == '/' )
@@ -124,7 +143,7 @@
- function cleanURLs (&$urls, $sitekey, $sitename)
+ function cleanURLs (&$urls, $key_type, $key_value, $site_name, $edition)
{
// todo: support des ../
@@ -133,7 +152,7 @@
$value = trim($value);
- if ( substr($value, 0, 7) == 'http://') // si l'url
commence par http://, on la retire
+ if ( substr($value, 0, 7) == 'http://' ||
substr($value, 0, 13) == '/?newbookmark' ) // si l'url commence par http://, on
la retire
{
$value = '';
}
@@ -143,14 +162,14 @@
$value = substr($value, 2);
// on ajoute $sitepath
- $value = '/'.$sitekey.'/'.$sitename.'/'.$value;
+ $value =
'/'.$key_type.'@'.$key_value.'/'.$site_name.'-'.$edition.'/'.$value;
}
if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
$value = substr($value, 0, -1);
// On retire les liens vers les diverses versions
- if (
preg_match("#^/[A-Z]{3,3}@$sitekey/$sitename/?-[0-9]+$#i", $value, $matches,
PREG_OFFSET_CAPTURE) )
+ if (
preg_match("#^/[A-Z]{3,3}@$key_value/$site_name/?-[0-9]+$#i", $value, $matches,
PREG_OFFSET_CAPTURE) )
$value = '';
// mise à jour de l'url
@@ -181,21 +200,21 @@
}
if ( !empty($buf['name']) &&
!empty($buf['content']) )
- $meta[$buf['name']] = $buf['content'];
+ $metas[$buf['name']] = $buf['content'];
unset($buf);
}
}
- return $meta;
+ return $metas;
}
function extractURLs ()
{
- if ( preg_match_all('/<a href="(.*?)".*>/i',
$this->buffer_contents, $matches) )
+ if ( preg_match_all('/<a href="(.*?)".*>/i', $this->buffer,
$matches) )
return $matches[1];
}
@@ -208,7 +227,43 @@
$splitedURL['key_type'] = 'SSK';
mysql_query("INSERT INTO freesites_keys ( key_type, key_value,
site_name, edition, created, last_update ) VALUES ('$splitedURL[key_type]',
'$splitedURL[key_value]', '$splitedURL[site_name]', '$splitedURL[edition]',
NOW(), NOW() ) ");
+
+ return mysql_insert_id();
}
+
+ function dbGetFreesiteId ($splitedURL)
+ {
+
+ $result = mysql_query("SELECT id FROM freesites_keys WHERE
key_value = '$splitedURL[key_value]' ");
+ if ( mysql_num_rows($result) > 0 )
+ {
+ list($id_freesite) = mysql_fetch_row($result);
+ return $id_freesite;
+ }
+ else
+ return false;
+ }
+
+ function dbAddFreesiteInformations ($id_freesite, $title, $metas)
+ {
+
+ mysql_query("INSERT INTO freesites_informations ( id_freesite,
title, meta_description, meta_keywords ) VALUES ( '$id_freesite', '$title',
'$metas[description]', '$metas[keywords]' ) ");
+ }
+
+ function dbAddFreesiteURL ($id_freesite, $url)
+ {
+
+ mysql_query("INSERT INTO freesites_urls ( id_freesite, path )
VALUES ( '$id_freesite', '$url' ) ");
+ }
+
+ function requestingFreesite ($splitedURL)
+ {
+
+ $path = $this->constructURL($splitedURL);
+ $this->getDistantFile($path);
+
+
+ }
}
Modified: trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql
===================================================================
--- trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql 2006-06-03
16:07:15 UTC (rev 9029)
+++ trunk/apps/DarknetSpiderBot/setup/darknetspiderbot.sql 2006-06-03
16:53:25 UTC (rev 9030)
@@ -1,72 +1,58 @@
--- phpMyAdmin SQL Dump
--- version 2.6.1-rc2
--- http://www.phpmyadmin.net
---
--- Serveur: localhost
--- Généré le : Jeudi 13 Avril 2006 à 22:15
--- Version du serveur: 4.0.20
--- Version de PHP: 4.3.4
---
--- Base de données: `darknetspiderbot`
---
-
--- --------------------------------------------------------
-
---
--- Structure de la table `freesites_informations`
---
-
-CREATE TABLE `freesites_informations` (
- `id_freesites` smallint(5) unsigned NOT NULL default '0',
- `title` varchar(255) NOT NULL default '',
- `meta_description` varchar(255) NOT NULL default '',
- `meta_keywords` text NOT NULL,
- UNIQUE KEY `id_freesites` (`id_freesites`)
-) TYPE=MyISAM;
-
---
--- Contenu de la table `freesites_informations`
---
-
-
--- --------------------------------------------------------
-
---
--- Structure de la table `freesites_keys`
---
-
-CREATE TABLE `freesites_keys` (
- `id` smallint(5) unsigned NOT NULL auto_increment,
- `key_type` enum('CHK','SSK') NOT NULL default 'CHK',
- `key_value` varchar(255) NOT NULL default '',
- `site_name` varchar(255) NOT NULL default '',
- `edition` smallint(5) unsigned NOT NULL default '0',
- `created` timestamp(14) NOT NULL,
- `last_update` timestamp(14) NOT NULL default '00000000000000',
- PRIMARY KEY (`id`),
- UNIQUE KEY `key_value` (`key_value`),
- KEY `last_update` (`last_update`)
-) TYPE=MyISAM AUTO_INCREMENT=1 ;
-
---
--- Contenu de la table `freesites_keys`
---
-
-
--- --------------------------------------------------------
-
---
--- Structure de la table `freesites_urls`
---
-
-CREATE TABLE `freesites_urls` (
- `id_freesites` smallint(6) NOT NULL default '0',
- `url` varchar(255) NOT NULL default '',
- `status` enum('standby','retrieving','retrieved','error') NOT NULL default
'standby',
- KEY `id_freesites` (`id_freesites`)
-) TYPE=MyISAM;
-
---
--- Contenu de la table `freesites_urls`
---
-
+-- phpMyAdmin SQL Dump
+-- version 2.6.1
+-- http://www.phpmyadmin.net
+--
+-- Serveur: localhost
+-- Généré le : Samedi 03 Juin 2006 à 18:45
+-- Version du serveur: 4.1.9
+-- Version de PHP: 4.3.10
+--
+-- Base de données: `darknetspiderbot`
+--
+
+-- --------------------------------------------------------
+
+--
+-- Structure de la table `freesites_informations`
+--
+
+CREATE TABLE `freesites_informations` (
+ `id_freesite` smallint(5) unsigned NOT NULL default '0',
+ `title` varchar(255) NOT NULL default '',
+ `meta_description` varchar(255) NOT NULL default '',
+ `meta_keywords` text NOT NULL,
+ UNIQUE KEY `id_freesite` (`id_freesite`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+
+-- --------------------------------------------------------
+
+--
+-- Structure de la table `freesites_keys`
+--
+
+CREATE TABLE `freesites_keys` (
+ `id` smallint(5) unsigned NOT NULL auto_increment,
+ `key_type` enum('CHK','SSK') NOT NULL default 'CHK',
+ `key_value` varchar(255) NOT NULL default '',
+ `site_name` varchar(255) NOT NULL default '',
+ `edition` smallint(5) unsigned NOT NULL default '0',
+ `created` timestamp NOT NULL default CURRENT_TIMESTAMP on update
CURRENT_TIMESTAMP,
+ `last_update` timestamp NOT NULL default '0000-00-00 00:00:00',
+ PRIMARY KEY (`id`),
+ UNIQUE KEY `key_value` (`key_value`),
+ KEY `last_update` (`last_update`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ;
+
+-- --------------------------------------------------------
+
+--
+-- Structure de la table `freesites_urls`
+--
+
+CREATE TABLE `freesites_urls` (
+ `id_freesite` smallint(5) unsigned NOT NULL default '0',
+ `path` varchar(255) NOT NULL default '',
+ `status` enum('standby','retrieving','retrieved','error') NOT NULL default
'standby',
+ KEY `id_freesite` (`id_freesite`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+
\ No newline at end of file