Hi Sean, sorry I didn't see your comments earlier.

On Wed, Jul 15, 2009 at 01:14:46AM +0200, sean finney wrote:
> in your comments you say:
> 
>       /* Parse an ISO-6709 date as used in zone.tab. Returns end of the
> 
> s/date/coordinate/ ? :)

Hah! I've fixed this in patch v7.

> but more seriously:
> 
> * in create_zone_index() it looks like
>   find_zone_info(system_zone_info...) is called but system_zone_info
>   isn't initialized until after create_zone_index has returned.

This was fixed in v6.

> * i'm not sure but it seems like there might be a few corresponding free()'s
>   missing from the mallocs/strdups.

Yeah, everything here is malloc'ed but never free'd.  We could set it up 
to be free'd in the date extension's global destructor, but, it would 
require modifying php_date.c which I am reluctant to do.  I'm not sure 
it makes much difference; the memory will remain allocated for the 
lifetime of the process in any case, right?

> * it looks like load_zone_table() is called unconditionally, even if
>   it's not needed (i.e. a scall to localtime() will trigger it)

I think this is necessary, though I didn't realize that before v6, and 
didn't explain it at all in the v6 comments.  I've added some better 
commentary now in v7 - the problem is that we need the fake tzdb->data 
array to be created once the tzdb is created, since 
timezone_identifiers_list() peeks directly into it.

> * is parsing the comments really necessary?  i don't see any php functions
>   that use them.

They are exposed by the getLocation() interface -

$ php -r '$m = new DateTimeZone("America/New_York"); \
          print_r($m->getLocation());'
Array
(
    [country_code] => US
    [latitude] => 40.71416
    [longitude] => -73.99362
    [comments] => Eastern Time
)

but yes they do seem to be kind of useless, in general.

> also, before i came to the conclusion that some kind of intermediate
> hashtable was needed to do the mapping (at which point i gave up and
> figured i'd wait to hear back from you :), i had done a slightly 
> simpler implementation of the parsing using a clean mmap'd buffer
> and sscanf with "%ms" type formats which avoids a lot of the hard-coded
> lengths etc[1].  if you're interested i can hack that code into the
> load_zone_info (and parse_iso6709) to make a leaner/cleaner/more efficient
> implementation.

Does it end up being much simpler without relying on sscanf/%ms?  I 
would rather this code is portable across older glibcs (e.g. so it works 
on older versions of RHEL).

> also, FYI my posts to the fedora list are being automatically rejected
> so if i say anything important that should be shared someone should
> forward it along :)

I've requested that you are whitelisted on the Fedora list. Thanks a lot 
for reviewing the patch, greatly appreciated!

Regards, Joe
Add support for use of the system timezone database, rather
than embedding a copy.  Discussed upstream but was not desired.

History:
r7: per Sean Finney's review: simpler lat/long rounding,
    use stat() not access() to check existence of timezone,
    improve comments throughout.
r6: fix fd leak in r5, fix country code/BC flag use in 
    timezone_identifiers_list() using system db,
    fix use of PECL timezonedb to override system db,
r5: reverts addition of "System/Localtime" fake tzname.
    updated for 5.3.0, parses zone.tab to pick up mapping between
    timezone name, country code and long/lat coords
r4: added "System/Localtime" tzname which uses /etc/localtime
r3: fix a crash if /usr/share/zoneinfo doesn't exist (Raphael Geissert)
r2: add filesystem trawl to set up name alias index
r1: initial revision

--- php-5.3.0/ext/date/lib/parse_tz.c.systzdata
+++ php-5.3.0/ext/date/lib/parse_tz.c
@@ -20,6 +20,16 @@
 
 #include "timelib.h"
 
+#ifdef HAVE_SYSTEM_TZDATA
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "php_scandir.h"
+#endif
+
 #include <stdio.h>
 
 #ifdef HAVE_LOCALE_H
@@ -31,7 +41,12 @@
 #else
 #include <strings.h>
 #endif
+
+#ifndef HAVE_SYSTEM_TZDATA
 #include "timezonedb.h"
+#endif
+
+#include <ctype.h>
 
 #if (defined(__APPLE__) || defined(__APPLE_CC__)) && (defined(__BIG_ENDIAN__) 
|| defined(__LITTLE_ENDIAN__))
 # if defined(__LITTLE_ENDIAN__)
@@ -51,6 +66,11 @@
 
 static void read_preamble(const unsigned char **tzf, timelib_tzinfo *tz)
 {
+       if (memcmp(tzf, "TZif", 4) == 0) {
+               *tzf += 20;
+               return;
+       }
+    
        /* skip ID */
        *tzf += 4;
        
@@ -253,7 +273,435 @@ void timelib_dump_tzinfo(timelib_tzinfo 
        }
 }
 
-static int seek_to_tz_position(const unsigned char **tzf, char *timezone, 
const timelib_tzdb *tzdb)
+#ifdef HAVE_SYSTEM_TZDATA
+
+#ifdef HAVE_SYSTEM_TZDATA_PREFIX
+#define ZONEINFO_PREFIX HAVE_SYSTEM_TZDATA_PREFIX
+#else
+#define ZONEINFO_PREFIX "/usr/share/zoneinfo"
+#endif
+
+/* Hash table entry for the cache of the zone.tab mapping table. */
+struct location_info {
+       char code[2]; /* Country code. */
+       double latitude, longitude;
+       char name[64];
+       char *comment;
+       struct location_info *next;
+};
+
+/* System timezone database pointer. */
+static const timelib_tzdb *timezonedb_system = NULL;
+
+/* Cache of zone.tab location data. */
+static struct location_info **system_location_table;
+
+/* Size of the zone.tab hash table; a random-ish prime big enough to
+ * prevent too many collisions. */
+#define LOCINFO_HASH_SIZE (1021)
+
+/* Hash function for indexing the location_info hash table. */
+static uint32_t tz_hash(const char *str)
+{
+       const unsigned char *p = (const unsigned char *)str;
+       uint32_t hash = 5381;
+       int c;
+       
+       while ((c = *p++) != '\0') {
+               hash = (hash << 5) ^ hash ^ c;
+       }
+       
+       return hash % LOCINFO_HASH_SIZE;
+}
+
+/* Parse an ISO-6709 co-ordinate as used in zone.tab. Returns end of
+ * the parsed string on success, or NULL on parse error.  On success,
+ * writes the parsed number to *result. */
+static char *parse_iso6709(char *p, double *result)
+{
+       double v, sign;
+       char *pend;
+       size_t len;
+
+       if (*p == '+')
+               sign = 1.0;
+       else if (*p == '-')
+               sign = -1.0;
+       else
+               return NULL;
+
+       p++;
+       for (pend = p; *pend >= '0' && *pend <= '9'; pend++)
+               ;;
+
+       /* Annoying encoding used by zone.tab has no decimal point, so use
+        * the length to determine the format:
+        * 
+        * 4 = DDMM
+        * 5 = DDDMM
+        * 6 = DDMMSS
+        * 7 = DDDMMSS
+        */
+       len = pend - p;
+       if (len < 4 || len > 7) {
+               return NULL;
+       }
+
+       /* p => [D]DD */
+       v = (p[0] - '0') * 10.0 + (p[1] - '0');
+       p += 2;
+       if (len == 5 || len == 7)
+               v = v * 10.0 + (*p++ - '0');
+       /* p => MM[SS] */
+       v += (10.0 * (p[0] - '0')
+                 + p[1] - '0') / 60.0;
+       p += 2;
+       /* p => [SS] */
+       if (len > 5) {
+               v += (10.0 * (p[0] - '0')
+                         + p[1] - '0') / 3600.0;
+               p += 2;
+       }
+
+       /* Round to five decimal place, not because it's a good idea,
+        * but, because the builtin data uses rounded data, so, match
+        * that. */
+       *result = sign * (int)(v * 100000.0 + 0.5) / 100000.0;
+
+       return p;
+}
+
+/* This function parses the zone.tab file to build up the mapping of
+ * timezone to country code and geographic location, and returns a
+ * hash table. The hash table is indexed by the function:
+ *
+ *      tz_hash(timezone-name)
+ */
+static struct location_info **create_location_table(void)
+{
+       struct location_info **li, *i;
+       char zone_tab[PATH_MAX];
+       char line[512];
+       FILE *fp;
+
+       strncpy(zone_tab, ZONEINFO_PREFIX "/zone.tab", sizeof zone_tab);
+
+       fp = fopen(zone_tab, "r");
+       if (!fp) {
+               return NULL;
+       }
+
+       li = calloc(LOCINFO_HASH_SIZE, sizeof *li);
+
+       while (fgets(line, sizeof line, fp)) {
+               char *p = line, *code, *name, *comment;
+               uint32_t hash;
+               double latitude, longitude;
+
+               while (isspace(*p))
+                       p++;
+
+               if (*p == '#' || *p == '\0' || *p == '\n')
+                       continue;
+               
+               if (!isalpha(p[0]) || !isalpha(p[1]) || p[2] != '\t')
+                       continue;
+               
+               /* code => AA */
+               code = p;
+               p[2] = 0;
+               p += 3;
+
+               /* coords => [+-][D]DDMM[SS][+-][D]DDMM[SS] */
+               p = parse_iso6709(p, &latitude);
+               if (!p) {
+                       continue;
+               }
+               p = parse_iso6709(p, &longitude);
+               if (!p) {
+                       continue;
+               }
+
+               if (!p || *p != '\t') {
+                       continue;
+               }
+
+               /* name = string */
+               name = ++p;
+               while (*p != '\t' && *p && *p != '\n')
+                       p++;
+
+               *p++ = '\0';
+
+               /* comment = string */
+               comment = p;
+               while (*p != '\t' && *p && *p != '\n')
+                       p++;
+
+               if (*p == '\n' || *p == '\t')
+                       *p = '\0';
+               
+               hash = tz_hash(name);
+               i = malloc(sizeof *i);
+               memcpy(i->code, code, 2);
+               strncpy(i->name, name, sizeof i->name);
+               i->comment = strdup(comment);
+               i->longitude = longitude;
+               i->latitude = latitude;
+               i->next = li[hash];
+               li[hash] = i;
+               /* printf("%s [%u, %f, %f]\n", name, hash, latitude, 
longitude); */
+       }
+
+       fclose(fp);
+
+       return li;
+}
+
+/* Return location info from hash table, using given timezone name.
+ * Returns NULL if the name could not be found. */
+const struct location_info *find_zone_info(struct location_info **li, 
+                                                                               
   const char *name)
+{
+       uint32_t hash = tz_hash(name);
+       const struct location_info *l;
+
+       if (!li) {
+               return NULL;
+       }
+
+       for (l = li[hash]; l; l = l->next) {
+               if (strcasecmp(l->name, name) == 0)
+                       return l;
+       }
+
+       return NULL;
+}       
+
+/* Filter out some non-tzdata files and the posix/right databases, if
+ * present. */
+static int index_filter(const struct dirent *ent)
+{
+       return strcmp(ent->d_name, ".") != 0
+               && strcmp(ent->d_name, "..") != 0
+               && strcmp(ent->d_name, "posix") != 0
+               && strcmp(ent->d_name, "posixrules") != 0
+               && strcmp(ent->d_name, "right") != 0
+               && strstr(ent->d_name, ".tab") == NULL;
+}
+
+/* Comparison callback for qsort(), used to alpha-sort the index
+ * array by timezone name. */
+static int sysdbcmp(const void *first, const void *second)
+{
+       const timelib_tzdb_index_entry *alpha = first, *beta = second;
+       
+       return strcmp(alpha->id, beta->id);
+}
+
+
+/* Create the zone identifier index by trawling the filesystem. */
+static void create_zone_index(timelib_tzdb *db)
+{
+       size_t dirstack_size,  dirstack_top;
+       size_t index_size, index_next;
+       timelib_tzdb_index_entry *db_index;
+       char **dirstack;
+
+       /* LIFO stack to hold directory entries to scan; each slot is a
+        * directory name relative to the zoneinfo prefix. */
+       dirstack_size = 32;
+       dirstack = malloc(dirstack_size * sizeof *dirstack);
+       dirstack_top = 1;
+       dirstack[0] = strdup("");
+       
+       /* Index array. */
+       index_size = 64;
+       db_index = malloc(index_size * sizeof *db_index);
+       index_next = 0;
+
+       do {
+               struct dirent **ents;
+               char name[PATH_MAX], *top;
+               int count;
+
+               /* Pop the top stack entry, and iterate through its contents. */
+               top = dirstack[--dirstack_top];
+               snprintf(name, sizeof name, ZONEINFO_PREFIX "/%s", top);
+
+               count = php_scandir(name, &ents, index_filter, php_alphasort);
+
+               while (count > 0) {
+                       struct stat st;
+                       const char *leaf = ents[count - 1]->d_name;
+
+                       snprintf(name, sizeof name, ZONEINFO_PREFIX "/%s/%s", 
+                                top, leaf);
+                       
+                       if (strlen(name) && stat(name, &st) == 0) {
+                               /* Name, relative to the zoneinfo prefix. */
+                               const char *root = top;
+
+                               if (root[0] == '/') root++;
+
+                               snprintf(name, sizeof name, "%s%s%s", root, 
+                                        *root ? "/": "", leaf);
+
+                               if (S_ISDIR(st.st_mode)) {
+                                       if (dirstack_top == dirstack_size) {
+                                               dirstack_size *= 2;
+                                               dirstack = realloc(dirstack, 
+                                                                               
   dirstack_size * sizeof *dirstack);
+                                       }
+                                       dirstack[dirstack_top++] = strdup(name);
+                               }
+                               else {
+                                       if (index_next == index_size) {
+                                               index_size *= 2;
+                                               db_index = realloc(db_index,
+                                                                               
   index_size * sizeof *db_index);
+                                       }
+
+                                       db_index[index_next++].id = 
strdup(name);
+                               }
+                       }
+
+                       free(ents[--count]);
+               }
+               
+               if (count != -1) free(ents);
+               free(top);
+       } while (dirstack_top);
+
+       /* Alpha-sort the index array; shouldn't be technically necessary
+        * but some of the test cases rely on this, and, it matches the
+        * builtin database. */
+       qsort(db_index, index_next, sizeof *db_index, sysdbcmp);
+       
+       db->index = db_index;
+       db->index_size = index_next;
+
+       free(dirstack);
+}
+
+#define FAKE_HEADER "1234\0??\1??"
+#define FAKE_BC_POS (0)
+#define FAKE_UTC_POS (7 - 4)
+
+/* Create a fake data segment for database 'sysdb'.   This mocks
+ * up a fake ->data segment for the given timezone database. 
+ * php_date.c::timezone_identifiers_list() looks at data[pos + 4]
+ * through data[pos + 6] to compare the country code and BC flag, 
+ * which are stored in the builtin data array like:
+ *
+ *    (pos + 4) => BC flag
+ *    (pos + 5, pos + 6) => Two chars of country code
+ *
+ * where pos is the index corresponding to the timezone name.
+ *
+ * Timezone names are classified here into three types:
+ *    1) UTC, which is special
+ *    2) "normal" zone names
+ *    3) "backwards-compat" zone names
+ *
+ * (boolean logic of the BC flag seems to be inverted, but hey)
+ *
+ * UTC is special since it has BC=\1, code = "??"
+ * "normal" zones exist in zone.tab and have the given c-code and BC=\1
+ * "backwards-compat" zones don't exist in zone.tab and have BC=\0
+ *
+ * Since UTC and the BC zones are constant, they are encoded in the
+ * FAKE_HEADER prefix, and pos pointers index into that.
+ *
+ * FAKE_HEADER is hence four random bytes, then the BC zone segment
+ * (three bytes), then the UTC zone segment (another three).
+ *
+ * For all "normal" zones, three bytes are appended to the data array;
+ * the BC flag, always 1, and the two bytes of country code.
+ */
+static void fake_data_segment(timelib_tzdb *sysdb,
+                                                         struct location_info 
**info)
+{
+       size_t n;
+       char *data, *p;
+       
+       /* Worst case maximum is 3 bytes per zone, plus the header. */
+       data = malloc((3 * sysdb->index_size) + sizeof(FAKE_HEADER) - 1);
+       
+       /* Append the fake header, p then = next byte */
+       p = mempcpy(data, FAKE_HEADER, sizeof(FAKE_HEADER) - 1);
+       
+       for (n = 0; n < sysdb->index_size; n++) {
+               const struct location_info *li;
+               timelib_tzdb_index_entry *ent;
+               
+               /* Lost const'ness since we're modifying the pos pointer. */
+               ent = (timelib_tzdb_index_entry *)&sysdb->index[n];
+               
+               /* Lookup the timezone name in the hash table. */
+               if (strcmp(ent->id, "UTC") == 0) {
+                       ent->pos = FAKE_UTC_POS;
+                       continue;
+               }
+               
+               li = find_zone_info(info, ent->id);
+               if (li) {
+                       /* If found, append the BC byte and the country code; 
set
+                        * the position index for the timezone to point to
+                        * this.  */
+                       ent->pos = (p - data) - 4;
+                       *p++ = '\x01';
+                       *p++ = li->code[0];
+                       *p++ = li->code[1];
+               }
+               else {
+                       /* If not found, the timezone data can
+                        * point at the header. */
+                       ent->pos = 0;
+               }
+       }
+       
+       /* Store the fake data array */
+       sysdb->data = (unsigned char *)data;
+}
+
+/* Evaluates to true if given timezone name is valid. */
+#define is_valid_tz_name(tz_) (tz_[0] && strstr(tz_, "..") == NULL)
+
+/* Return the mmap()ed tzfile if found, else NULL.     On success, the
+ * length of the mapped data is placed in *length. */
+static char *map_tzfile(const char *timezone, size_t *length)
+{
+       char fname[PATH_MAX];
+       struct stat st;
+       char *p;
+       int fd;
+       
+       if (!is_valid_tz_name(timezone)) {
+               return NULL;
+       }
+
+       snprintf(fname, sizeof fname, ZONEINFO_PREFIX "/%s", timezone);
+       
+       fd = open(fname, O_RDONLY);
+       if (fd == -1) {
+               return NULL;
+       } else if (fstat(fd, &st) != 0 || st.st_size < 21) {
+               close(fd);
+               return NULL;
+       }
+
+       *length = st.st_size;
+       p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
+       close(fd);
+       
+       return p != MAP_FAILED ? p : NULL;
+}
+#endif
+
+/* seek_to_tz_position() for a builtin/external database. */
+static int inmem_seek_to_tz_position(const unsigned char **tzf, 
+                                                                        char 
*timezone, const timelib_tzdb *tzdb)
 {
        int left = 0, right = tzdb->index_size - 1;
 #ifdef HAVE_SETLOCALE
@@ -292,36 +740,131 @@ static int seek_to_tz_position(const uns
        return 0;
 }
 
+/* Modified seek_to_tz_position wrapper which handles the system
+ * database and the builtin/external databases in the same way.
+ * Returns zero on failure on non-zero on success.  On success, (*map,
+ * *maplen) is an mmap'ed region if *map is non-NULL, and must be
+ * munmaped after use.  */
+static int seek_to_tz_position(const unsigned char **tzf, char *timezone, 
+                                                          char **map, size_t 
*maplen,
+                                                          const timelib_tzdb 
*tzdb)
+{
+#ifdef HAVE_SYSTEM_TZDATA
+       if (tzdb == timezonedb_system) {
+               char *orig;
+
+               orig = map_tzfile(timezone, maplen);
+               if (orig == NULL) {
+                       return 0;
+               }
+               
+               (*tzf) = (unsigned char *)orig ;
+               *map = orig;
+                               
+               return 1;
+       }
+       else 
+#endif
+       {
+               return inmem_seek_to_tz_position(tzf, timezone, tzdb);
+       }
+}
+
 const timelib_tzdb *timelib_builtin_db(void)
 {
+#ifdef HAVE_SYSTEM_TZDATA
+       if (timezonedb_system == NULL) {
+               timelib_tzdb *tmp = malloc(sizeof *tmp);
+
+               tmp->version = "0.system";
+               tmp->data = NULL;
+               create_zone_index(tmp);
+               system_location_table = create_location_table();
+               fake_data_segment(tmp, system_location_table);
+               timezonedb_system = tmp;
+       }
+                       
+       return timezonedb_system;
+#else
        return &timezonedb_builtin;
+#endif
 }
 
 const timelib_tzdb_index_entry *timelib_timezone_builtin_identifiers_list(int 
*count)
 {
+#ifdef HAVE_SYSTEM_TZDATA
+       *count = timezonedb_system->index_size;
+       return timezonedb_system->index;
+#else
        *count = sizeof(timezonedb_idx_builtin) / 
sizeof(*timezonedb_idx_builtin);
        return timezonedb_idx_builtin;
+#endif
 }
 
 int timelib_timezone_id_is_valid(char *timezone, const timelib_tzdb *tzdb)
 {
        const unsigned char *tzf;
-       return (seek_to_tz_position(&tzf, timezone, tzdb));
+
+#ifdef HAVE_SYSTEM_TZDATA
+       if (tzdb == timezonedb_system) {
+               char fname[PATH_MAX];
+               struct stat st;
+               
+               if (!is_valid_tz_name(timezone)) {
+                       return 0;
+               }
+        
+               snprintf(fname, sizeof fname, ZONEINFO_PREFIX "/%s", timezone);
+        
+               return stat(fname, &st) == 0 && S_ISREG(st.st_mode);
+       }
+#endif
+
+       return (inmem_seek_to_tz_position(&tzf, timezone, tzdb));
 }
 
 timelib_tzinfo *timelib_parse_tzfile(char *timezone, const timelib_tzdb *tzdb)
 {
        const unsigned char *tzf;
+       char *memmap = NULL;
+       size_t maplen;
        timelib_tzinfo *tmp;
 
-       if (seek_to_tz_position(&tzf, timezone, tzdb)) {
+       if (seek_to_tz_position(&tzf, timezone, &memmap, &maplen, tzdb)) {
                tmp = timelib_tzinfo_ctor(timezone);
 
                read_preamble(&tzf, tmp);
                read_header(&tzf, tmp);
                read_transistions(&tzf, tmp);
                read_types(&tzf, tmp);
-               read_location(&tzf, tmp);
+
+#ifdef HAVE_SYSTEM_TZDATA
+               if (memmap) {
+                       const struct location_info *li;
+                       
+                       /* TZif-style - grok the location info from the system 
database,
+                        * if possible. */
+                       if ((li = find_zone_info(system_location_table, 
timezone)) != NULL) {
+                               tmp->location.comments = strdup(li->comment);
+                               strncpy(tmp->location.country_code, li->code, 
2);
+                               tmp->location.longitude = li->longitude;
+                               tmp->location.latitude = li->latitude;
+                               tmp->bc = 1;
+                       }
+                       else {
+                               strcpy(tmp->location.country_code, "??");
+                               tmp->bc = 0;
+                               tmp->location.comments = strdup("");
+                       }
+
+                       /* Now done with the mmap segment - discard it. */
+                       munmap(memmap, maplen);
+#endif
+               }
+               else {
+                       /* PHP-style - use the embedded info. */
+                       read_location(&tzf, tmp);
+               }
        } else {
                tmp = NULL;
        }
--- php-5.3.0/ext/date/lib/timelib.m4.systzdata
+++ php-5.3.0/ext/date/lib/timelib.m4
@@ -78,3 +78,17 @@ stdlib.h
 
 dnl Check for strtoll, atoll
 AC_CHECK_FUNCS(strtoll atoll strftime)
+
+PHP_ARG_WITH(system-tzdata, for use of system timezone data,
+[  --with-system-tzdata[=DIR]      to specify use of system timezone data],
+no, no)
+
+if test "$PHP_SYSTEM_TZDATA" != "no"; then
+   AC_DEFINE(HAVE_SYSTEM_TZDATA, 1, [Define if system timezone data is used])
+
+   if test "$PHP_SYSTEM_TZDATA" != "yes"; then
+      AC_DEFINE_UNQUOTED(HAVE_SYSTEM_TZDATA_PREFIX, "$PHP_SYSTEM_TZDATA",
+                         [Define for location of system timezone data])
+   fi
+fi
+

Reply via email to