It has been suggested to me that I resubmit this question to this list,
rather than the GENERAL list it was originaly sent to.

   I asked earlier about ways of doing an UPDATE involving a left outer
join and got some very useful feedback.

   This has thrown up a (to me) strange anomaly about the speed of such
an update.

   The input to this query is a fairly large (the example I'm working
with has 335,000 rows) set of records containing numbers to be looked
up in the lookup table.  This lookup table has 239 rows.

   I'm always reading the suggestion that doing a 'VACUUM ANALYZE' on a
database is 'A Good Thing' as it helps the planner to do the best thing, so
I arranged a vacuum analyze on the input records.

   Running the query takes about 13 mins or so.

   If, however I *don't* do an analyze, but leave the input table as
it was when imported the run takes about 2.5 mins!

   Looking at the output from 'explain' I can see that the main difference
in the way the planner does it is that it does a merge join in the non-analyze
case, and a hash join in the analyze case.

   Unfortunately I don't really know what this is implying, hence the call
for assistance.

   I have a file with all sorts of info about the problem (details of tables,
output of 'explain' etc) but as it is about 5K in size, and wide as well, I
didn't want to dump it in the list without any warning!

   However - it has been suggested that it should be OK to include this I have
now done so - hopefully with this message.

   Regards,
       Harry.

select version();
                               version                               
---------------------------------------------------------------------
 PostgreSQL 7.3.4 on i386-unknown-freebsd4.5, compiled by GCC 2.95.3

create table num_xlate (interim_num varchar(30) not null , num varchar(30) not null, 
starttime timestamp with time zone not null, endtime timestamp with time zone not 
null, constraint num_pos_dur check (endtime >= starttime), primary key (interim_num, 
starttime));

create table unrated_cdrs (cdr_id bigserial unique, interim_cli varchar(30), 
interim_tli varchar(30), cli varchar(30), tli varchar(30));
CREATE TABLE
copy unrated_cdrs (interim_cli, interim_tli) from '/data/swipe/bin/mt2.csv' with 
delimiter as ',';
COPY
explain update unrated_cdrs set cli = coalesce(b.num, unrated_cdrs.interim_cli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_cli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
                                                    QUERY PLAN                         
                           
------------------------------------------------------------------------------------------------------------------
 Merge Join  (cost=286.99..358.99 rows=1000 width=393)
   Merge Cond: ("outer".cdr_id = "inner".cdr_id)
   ->  Index Scan using unrated_cdrs_cdr_id_key on unrated_cdrs  (cost=0.00..52.00 
rows=1000 width=262)
   ->  Sort  (cost=286.99..289.49 rows=1000 width=131)
         Sort Key: un.cdr_id
         ->  Merge Join  (cost=139.66..237.16 rows=1000 width=131)
               Merge Cond: ("outer".interim_cli = "inner".interim_num)
               Join Filter: (("outer".starttime >= "inner".starttime) AND 
("outer".starttime <= "inner".endtime))
               ->  Sort  (cost=69.83..72.33 rows=1000 width=49)
                     Sort Key: un.interim_cli
                     ->  Seq Scan on unrated_cdrs un  (cost=0.00..20.00 rows=1000 
width=49)
               ->  Sort  (cost=69.83..72.33 rows=1000 width=82)
                     Sort Key: num_xlate.interim_num
                     ->  Seq Scan on num_xlate  (cost=0.00..20.00 rows=1000 width=82)
(14 rows)

update unrated_cdrs set cli = coalesce(b.num, unrated_cdrs.interim_cli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_cli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
UPDATE 335671
update unrated_cdrs set tli = coalesce(b.num, unrated_cdrs.interim_tli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_tli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
UPDATE 335671
        2m57.37s real           0.00s user              0.00s sys

DROP TABLE
create table unrated_cdrs (cdr_id bigserial unique, interim_cli varchar(30), 
interim_tli varchar(30), cli varchar(30), tli varchar(30));
CREATE TABLE
copy unrated_cdrs (interim_cli, interim_tli) from '/data/swipe/bin/mt2.csv' with 
delimiter as ',';
COPY
vacuum analyze unrated_cdrs;
VACUUM
explain update unrated_cdrs set cli = coalesce(b.num, unrated_cdrs.interim_cli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_cli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
                                                    QUERY PLAN                         
                           
------------------------------------------------------------------------------------------------------------------
 Hash Join  (cost=67773.77..112554.43 rows=335671 width=343)
   Hash Cond: ("outer".cdr_id = "inner".cdr_id)
   ->  Seq Scan on unrated_cdrs  (cost=0.00..8832.71 rows=335671 width=229)
   ->  Hash  (cost=61197.59..61197.59 rows=335671 width=114)
         ->  Merge Join  (cost=58661.58..61197.59 rows=335671 width=114)
               Merge Cond: ("outer".interim_cli = "inner".interim_num)
               Join Filter: (("outer".starttime >= "inner".starttime) AND 
("outer".starttime <= "inner".endtime))
               ->  Sort  (cost=58591.75..59430.93 rows=335671 width=32)
                     Sort Key: un.interim_cli
                     ->  Seq Scan on unrated_cdrs un  (cost=0.00..8832.71 rows=335671 
width=32)
               ->  Sort  (cost=69.83..72.33 rows=1000 width=82)
                     Sort Key: num_xlate.interim_num
                     ->  Seq Scan on num_xlate  (cost=0.00..20.00 rows=1000 width=82)
(13 rows)

update unrated_cdrs set cli = coalesce(b.num, unrated_cdrs.interim_cli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_cli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
UPDATE 335671
update unrated_cdrs set tli = coalesce(b.num, unrated_cdrs.interim_tli) from 
(unrated_cdrs as un left outer join num_xlate on (un.interim_tli = interim_num and 
un.starttime between num_xlate.starttime and num_xlate.endtime)) as b where 
unrated_cdrs.cdr_id = b.cdr_id;
UPDATE 335671
        13m43.74s real          0.00s user              0.00s sys
---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster

Reply via email to