ID:               27242
 Updated by:       [EMAIL PROTECTED]
 Reported By:      amix at amix dot dk
-Status:           Open
+Status:           Bogus
 Bug Type:         XML related
 Operating System: Windows and Mac OS X
 PHP Version:      4.3.5RC2
 New Comment:

Thank you for taking the time to write to us, but this is not
a bug. Please double-check the documentation available at
http://www.php.net/manual/ and the instructions on how to report
a bug at http://bugs.php.net/how-to-report.php

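Of course you get strange results if you try to parse 2 bytes at a
time: the character data handler is called once for every piece of
text the parser is given, not once per element, so the text of a
single element can arrive split over several calls. Your handler has
to append successive pieces that belong to the same element instead
of treating each call as a complete value.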




Previous Comments:
------------------------------------------------------------------------

[2004-02-13 12:35:06] amix at amix dot dk

Description:
------------
I have made a script to parse the DMOZ RDF XML files, which are HUGE
(one is 500 MB and the other is 1.2 GB).



The major problem I've got is that the XML parser outputs some bullshit
when parsing those large files!



The problem is how many bytes to read. I.e. this code:

      while ($data = fread($fp, 4096))



Now I have fixed this problem by loading the whole file into the
memory:

      while ($data = fread($fp, filesize($this->xml_file)))

      {



It takes some minutes to load the 500 MB file, but I can't do that with
the 1.2 GB file.



Ok, now I have searched a great deal on Google. I have looked at how
others parse XML files - and all the examples I have seen use fread
(including some PEAR scripts etc.)



I have also made an example which shows the problem on a smaller scale.
If you set fread to read 2 bytes at a time, it produces the weird
output shown under "Actual result" below.

Reproduce code:
---------------
This example is taken from a book.

<?php 

// Shared state used by the SAX-style handlers below (accessed via `global`).

// Name of the element most recently opened by startElementHandler().
$currentTag = "";

// Column names and (escaped) values collected for the record being parsed.
$fields = array();
$values = array();

// Next free index into $fields / $values. Initialised here because
// characterDataHandler() reads it before anything else assigns it.
$count = 0;

// Target table name; filled in from the <table name="..."> attribute.
$table = "";

// Path of the XML document to parse.
$xml_file = "data.xml";



/**
 * Start-tag callback registered with xml_set_element_handler().
 *
 * Records the element name in the global $currentTag. When the element is
 * named "table" (compared case-insensitively), its "name" attribute is
 * stored in the global $table.
 *
 * @param resource $parser     The XML parser invoking the callback (unused).
 * @param string   $name       Name of the element being opened.
 * @param array    $attributes Attribute name => value map for the element.
 */
function startElementHandler($parser, $name, $attributes)
{
      global $currentTag, $table;

      $currentTag = $name;

      // Only the <table> element carries information we need to keep.
      if (strtolower($name) != "table")
      {
            return;
      }

      $table = $attributes["name"];
}



/**
 * End-tag callback registered with xml_set_element_handler().
 *
 * When a "record" element (compared case-insensitively) closes, prints one
 * INSERT statement built from the global $fields / $values collected so far,
 * followed by a newline, and then resets the per-record state ($fields,
 * $values, $count, $currentTag). Closing any other element does nothing.
 *
 * The statement is only echoed, never executed; values are wrapped in
 * double quotes exactly as they were stored by characterDataHandler().
 *
 * @param resource $parser The XML parser invoking the callback (unused).
 * @param string   $name   Name of the element being closed.
 */
function endElementHandler($parser, $name)
{
      global $fields, $values, $count, $currentTag;

      // $connection is declared but not used inside this function.
      global $connection, $table;

      if (strtolower($name) != "record")
      {
            return;
      }

      $columnList = implode(", ", $fields);
      $valueList  = implode("\", \"", $values);

      echo "INSERT INTO " . $table . "(" . $columnList . ")"
         . " VALUES(\"" . $valueList . "\");" . "\n";

      // Start the next record with a clean slate.
      $currentTag = "";
      $count = 0;
      $values = array();
      $fields = array();
}



/**
 * Character-data callback registered with xml_set_character_data_handler().
 *
 * For every non-blank piece of text it receives, stores the global
 * $currentTag as a new column name and the escaped text as a new value,
 * then advances the global $count. Text that is empty after trim() is
 * ignored.
 *
 * NOTE: each call creates a NEW field/value pair. If the text of one
 * element is delivered in several calls (as happens when the document is
 * fed to xml_parse() in very small chunks), the element shows up as
 * several columns with partial values - this is the behaviour shown in
 * the "Actual result" section of this report.
 *
 * @param resource $parser The XML parser invoking the callback (unused).
 * @param string   $data   The piece of character data delivered by the parser.
 */
function characterDataHandler($parser, $data)
{
      global $fields, $values, $currentTag, $count;

      // Ignore whitespace-only text such as indentation between elements.
      if (trim($data) == "")
      {
            return;
      }

      $escaped = mysql_escape_string($data);

      $fields[$count] = $currentTag;
      $values[$count] = $escaped;

      $count++;
}



// ---------------------------------------------------------------------
// Parser setup and main read loop.
//
// Creates an XML parser, wires up the element and character-data
// handlers defined above, then feeds $xml_file to the parser in small
// chunks. Any parse error aborts the script with the error code,
// message and line number.
// ---------------------------------------------------------------------

$xml_parser = xml_parser_create();

xml_parser_set_option($xml_parser, XML_OPTION_SKIP_WHITE, TRUE);

// Keep element names in their original case instead of upper-casing them.
xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, FALSE);

xml_set_element_handler($xml_parser, "startElementHandler", "endElementHandler");
xml_set_character_data_handler($xml_parser, "characterDataHandler");

if (!($fp = fopen($xml_file, "rb")))
{
      die("File I/O error: $xml_file");
}

// The 2-byte chunk size is deliberate: it is what this example uses to
// reproduce the fragmented output.
//
// Loop on feof() rather than on the truthiness of the chunk:
//  * a chunk consisting of the single character "0" is falsy and would
//    otherwise end the loop early;
//  * when the file size is an exact multiple of the chunk size, feof()
//    only becomes true on the following (empty) read, and that empty
//    chunk must still be passed to xml_parse() with the final flag set
//    so that the parser can finish the document.
while (!feof($fp))
{
      $data = fread($fp, 2);

      if ($data === false)
      {
            // Read failure: stop instead of spinning on a broken stream.
            die("File I/O error: $xml_file");
      }

      if (!xml_parse($xml_parser, $data, feof($fp)))
      {
            $ec = xml_get_error_code($xml_parser);

            die("XML parser error (error code " . $ec . "): " .
                xml_error_string($ec) .
                "<br>Error occurred at line " .
                xml_get_current_line_number($xml_parser));
      }
}

fclose($fp);

xml_parser_free($xml_parser);





?> 



data.xml

<?xml version="1.0"?> 

<table name="readings"> 

      <record> 

           
<a>56565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656</a>


           
<b>12565656565656565656565656565656565656565656565656565656565656565622</b>


           
<c>785656565656565656565656565656565656565656565656565656565656565656.5</c>


      </record> 

      <record> 

           
<x>456565656565656565656565656565656565656565656565656565656565656565</x>


           
<y>-565656565656565656565656565656565656565656565656565656565656565610</y>


      </record> 

      <record> 

           
<x>156565656565656565656565656565656565656565656565656565656565656562</x>


           
<b>105656565656565656565656565656565656565656565656565656565656565656459</b>


           
<a>7565656565656565656565656565656565656565656565656565656565656565656</a>


           
<y>95656565656565656565656565656565656565656565656565656565656565656</y>


      </record> 

</table>



Expected result:
----------------
INSERT INTO readings(a, b, c)
VALUES("56565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656",
"12565656565656565656565656565656565656565656565656565656565656565622",
"785656565656565656565656565656565656565656565656565656565656565656.5");
INSERT INTO readings(x, y)
VALUES("456565656565656565656565656565656565656565656565656565656565656565",
"-565656565656565656565656565656565656565656565656565656565656565610");
INSERT INTO readings(x, b, a, y)
VALUES("156565656565656565656565656565656565656565656565656565656565656562",
"105656565656565656565656565656565656565656565656565656565656565656459",
"7565656565656565656565656565656565656565656565656565656565656565656",
"95656565656565656565656565656565656565656565656565656565656565656");



Actual result:
--------------
INSERT INTO readings(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, b,
b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
b, b, b, b, b, b, b, b, b, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c,
c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c) VALUES("56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "12", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"22", "78", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", ".5");
INSERT INTO readings(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, y, y, y, y, y, y, y,
y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
y, y, y) VALUES("4", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"5", "-", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "10");
INSERT INTO readings(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, b, b, b, b, b, b, b,
b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
b, b, b, b, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,
a, a, a, a, a, a, a, a, a, a, a, a, a, a, y, y, y, y, y, y, y, y, y, y,
y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y)
VALUES("1", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "2", "1",
"05", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65",
"65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65",
"65", "65", "65", "65", "65", "65", "65", "65", "64", "59", "75", "65",
"65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65",
"65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65", "65",
"65", "65", "65", "65", "65", "65", "65", "6", "9", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56", "56",
"56", "56", "56", "56", "56");




------------------------------------------------------------------------


-- 
Edit this bug report at http://bugs.php.net/?id=27242&edit=1

Reply via email to