[Bug-wget] bug #28541: Faulty time information in FTP directory listing

2010-07-26 Thread John Trengrove
This is a patch to change the behaviour for FTP directory listing.
Currently the hours are printed only if the hour is non-zero and does
not account for whether we received the hours & minutes or not. A
simplistic patch would be to just create an else if statement in
ftp_index to check if the minutes are non-zero also. This fails though
for the case 00:00. I am uncertain how much this matters.

I created a more complicated patch (below) that alters the struct
fileinfo in ftp.h to hold whether hours:minutes were stored or not. It
assumes only UNIX FTP servers fail to provide the time for old
entries.

This would be my first contribution to wget. Criticism/feedback encouraged.

ChangeLog

2010-07-25  John Trengrove  jtrengr...@gmail.com

  * ftp.h: Modified struct to hold parsetype.
  Added enum for parsetype.
  * ftp-ls.c:
  (ftp_parse_unix_ls): Default to TT_DAY. Change to TT_HOUR_MIN if
hours/minutes parsed.
  (ftp_parse_winnt_ls): Default to TT_HOUR_MIN.
  (ftp_parse_vms_ls): Default to TT_HOUR_MIN.
  (ftp_index): Print only if fileinfo struct value ptype set to TT_HOUR_MIN.

Patch

=== modified file 'src/ftp-ls.c'
--- src/ftp-ls.c 2010-05-08 19:56:15 +0000
+++ src/ftp-ls.c 2010-07-25 06:00:04 +0000
@@ -100,7 +100,7 @@
  };
  int next, len, i, error, ignore;
  int year, month, day;         /* for time analysis */
-  int hour, min, sec;
+  int hour, min, sec, ptype;
  struct tm timestruct, *tnow;
  time_t timenow;

@@ -183,6 +183,7 @@
                                   treated equally for now.  */
      year = hour = min = sec = 0; /* Silence the compiler.  */
      month = day = 0;
+      ptype = TT_DAY;
      next = -1;
      /* While there are tokens on the line, parse them.  Next is the
         number of tokens left until the filename.
@@ -262,6 +263,7 @@
                      /* This means these were hours!  */
                      hour = year;
                      year = 0;
+                      ptype = TT_HOUR_MIN;
                      ++tok;
                      /* Get the minutes...  */
                      for (; c_isdigit (*tok); tok++)
@@ -414,6 +416,7 @@
      timestruct.tm_yday  = 0;
      timestruct.tm_isdst = -1;
      l->tstamp = mktime (timestruct); /* store the time-stamp */
+      l->ptype = ptype;

      xfree (line);
    }
@@ -501,6 +504,7 @@
      timestruct.tm_yday  = 0;
      timestruct.tm_isdst = -1;
      cur.tstamp = mktime (timestruct); /* store the time-stamp */
+      cur.ptype = TT_HOUR_MIN;

      DEBUGP(("Timestamp: %ld\n", cur.tstamp));

@@ -987,6 +991,7 @@
        }
      cur.tstamp = timenow; /* Store the time-stamp. */
      DEBUGP(("Timestamp: %ld\n", cur.tstamp));
+      cur.ptype = TT_HOUR_MIN;

      /* Add the data for this item to the linked list, */
      if (!dir)
@@ -1134,7 +1139,7 @@

          fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
                  ptm->tm_mday);
-          if (ptm->tm_hour)
+          if (f->ptype == TT_HOUR_MIN)
            fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
          else
            fprintf (fp, "       ");

=== modified file 'src/ftp.h'
--- src/ftp.h 2010-05-08 19:56:15 +0000
+++ src/ftp.h 2010-07-25 05:58:22 +0000
@@ -87,6 +87,12 @@
  GLOB_GLOBALL, GLOB_GETALL, GLOB_GETONE
};

+/* Used to test whether the parsed time includes hours and minutes. */
+enum parsetype
+{
+  TT_HOUR_MIN, TT_DAY
+};
+
/* Information about one filename in a linked list.  */
struct fileinfo
{
@@ -94,6 +100,7 @@
  char *name; /* file name */
  wgint size; /* file size */
  long tstamp; /* time-stamp */
+  enum parsetype ptype; /* time parsing */
  int perms; /* file permissions */
  char *linkto; /* link to which file points */
  struct fileinfo *prev; /* previous... */



Re: [Bug-wget] bug #28541: Faulty time information in FTP directory listing

2010-07-26 Thread Giuseppe Scrivano
Thanks for your contribution!  It looks good but I want to check the
patch better before using it.

Cheers,
Giuseppe



John Trengrove jtrengr...@gmail.com writes:

 This is a patch to change the behaviour for FTP directory listing.
 Currently the hours are printed only if the hour is non-zero and does
 not account for if we received the hours  minutes or not. A
 simplistic patch would be to just create an else if statement in
 ftp_index to check if the minutes are non-zero also. This fails though
 for the case 00:00. I am uncertain how much this matters.

 I created a more complicated patch (below) that alters the struct
 fileinfo in ftp.h to hold whether hours:minutes were stored or not. It
 is assumes only UNIX FTP servers fail to provide the time for old
 entries.

 This would be my first contribution to wget. Criticism/feedback encouraged.

 ChangeLog

 2010-07-25  John Trengrove  jtrengr...@gmail.com

   * ftp.h: Modified struct to hold parsetype.
   Added enum for parsetype.
   * ftp-ls.c:
   (ftp_parse_unix_ls): Default to TT_DAY. Change to TT_HOUR_MIN if
 hours/minutes parsed.
   (ftp_parse_winnt_ls): Default to TT_HOUR_MIN.
   (ftp_parse_vms_ls): Default to TT_HOUR_MIN.
   (ftp_index): Print only if fileinfo struct value ttype set to TT_HOUR_MIN.

 Patch

 === modified file 'src/ftp-ls.c'
 --- src/ftp-ls.c 2010-05-08 19:56:15 +
 +++ src/ftp-ls.c 2010-07-25 06:00:04 +
 @@ -100,7 +100,7 @@
   };
   int next, len, i, error, ignore;
   int year, month, day;         /* for time analysis */
 -  int hour, min, sec;
 +  int hour, min, sec, ptype;
   struct tm timestruct, *tnow;
   time_t timenow;

 @@ -183,6 +183,7 @@
                                    treated equally for now.  */
       year = hour = min = sec = 0; /* Silence the compiler.  */
       month = day = 0;
 +      ptype = TT_DAY;
       next = -1;
       /* While there are tokens on the line, parse them.  Next is the
          number of tokens left until the filename.
 @@ -262,6 +263,7 @@
                       /* This means these were hours!  */
                       hour = year;
                       year = 0;
 +                      ptype = TT_HOUR_MIN;
                       ++tok;
                       /* Get the minutes...  */
                       for (; c_isdigit (*tok); tok++)
 @@ -414,6 +416,7 @@
       timestruct.tm_yday  = 0;
       timestruct.tm_isdst = -1;
       l-tstamp = mktime (timestruct); /* store the time-stamp */
 +      l-ptype = ptype;

       xfree (line);
     }
 @@ -501,6 +504,7 @@
       timestruct.tm_yday  = 0;
       timestruct.tm_isdst = -1;
       cur.tstamp = mktime (timestruct); /* store the time-stamp */
 +      cur.ptype = TT_HOUR_MIN;

       DEBUGP((Timestamp: %ld\n, cur.tstamp));

 @@ -987,6 +991,7 @@
         }
       cur.tstamp = timenow; /* Store the time-stamp. */
       DEBUGP((Timestamp: %ld\n, cur.tstamp));
 +      cur.ptype = TT_HOUR_MIN;

       /* Add the data for this item to the linked list, */
       if (!dir)
 @@ -1134,7 +1139,7 @@

           fprintf (fp, %d %s %02d , ptm-tm_year + 1900, 
 months[ptm-tm_mon],
                   ptm-tm_mday);
 -          if (ptm-tm_hour)
 +          if (f-ptype == TT_HOUR_MIN)
             fprintf (fp, %02d:%02d  , ptm-tm_hour, ptm-tm_min);
           else
             fprintf (fp,        );

 === modified file 'src/ftp.h'
 --- src/ftp.h 2010-05-08 19:56:15 +
 +++ src/ftp.h 2010-07-25 05:58:22 +
 @@ -87,6 +87,12 @@
   GLOB_GLOBALL, GLOB_GETALL, GLOB_GETONE
 };

 +/* Used by to test if time parsed includes hours and minutes. */
 +enum parsetype
 +{
 +  TT_HOUR_MIN, TT_DAY
 +};
 +
 /* Information about one filename in a linked list.  */
 struct fileinfo
 {
 @@ -94,6 +100,7 @@
   char *name; /* file name */
   wgint size; /* file size */
   long tstamp; /* time-stamp */
 +  enum parsetype ptype; /* time parsing */
   int perms; /* file permissions */
   char *linkto; /* link to which file points */
   struct fileinfo *prev; /* previous... */



Re: [Bug-wget] downloading links in a dynamic site

2010-07-26 Thread Keisial
 Vinh Nguyen wrote:
 Dear list,

 My goal is to download some pdf files from a dynamic site (not sure on
 the terminology).  For example, I would execute:

 wget -U firefox -r -l1 -nd -e robots=off -A '*.pdf,*.pdf.*'
 http://site.com/?sortorder=asc&p_o=0

 and would get my 10 pdf files.  On the page I can click a Next link
 (to have more files), and I execute:

 wget -U firefox -r -l1 -nd -e robots=off -A '*.pdf,*.pdf.*'
 http://site.com/?sortorder=asc&p_o=10

 However, the downloaded files are identical to the previous.  I tried
 the cookies setting and referer setting:

 wget -U firefox --cookies=on --keep-session-cookies
 --save-cookies=cookie.txt -r -l1 -nd -e robots=off -A '*.pdf,*.pdf.*'
 http://site.com/?sortorder=asc&p_o=0
 wget -U firefox --referer='http://site.com/?sortorder=asc&p_o=0'
 --cookies=on --load-cookies=cookie.txt --keep-session-cookies
 --save-cookies=cookie.txt -r -l1 -nd -e robots=off -A '*.pdf,*.pdf.*'
 http://site.com/?sortorder=asc&p_o=10

 but the results again are identical.  Any suggestions?

 Thanks.
 Vinh

Look at the page source to see how they are generating the URLs.
Maybe they are using some ugly javascript, although that discards
the benefit of paging...




Re: [Bug-wget] downloading links in a dynamic site

2010-07-26 Thread Vinh Nguyen
On Mon, Jul 26, 2010 at 1:51 PM, Vinh Nguyen vinhdi...@gmail.com wrote:
 That's displayed in the source.  Also, when i try to manually enter
 the url changing =10, =20, =30, I get the right page, so I don't think
 it's a javascript issue.  What else could it be besides referer and
 cookies?

Confirmed that it also works in a DIFFERENT browser (conkeror and
firefox).  Hmm, what can be the difference between wget and these
browsers?



Re: [Bug-wget] downloading links in a dynamic site

2010-07-26 Thread Vinh Nguyen
On Mon, Jul 26, 2010 at 2:02 PM, Vinh Nguyen vinhdi...@gmail.com wrote:
 On Mon, Jul 26, 2010 at 1:51 PM, Vinh Nguyen vinhdi...@gmail.com wrote:
 That's displayed in the source.  Also, when i try to manually enter
 the url changing =10, =20, =30, I get the right page, so I don't think
 it's a javascript issue.  What else could it be besides referer and
 cookies?

 Confirmed that it also works in a DIFFERENT browser (conkeror and
 firefox).  Hmm, what can be the difference between wget and these
 browsers?

This issue is RESOLVED.  Put 'quotes' around the url.  I thought I had
this the entire time.  Thanks everyone.

Vinh