Re: [sqlite] single table data collapse with constraints

Keith Medcalf Wed, 29 Jan 2020 11:25:17 -0800

Excuse the top posting.  This perhaps:

create table srcdata
(
    CLS1    text not null,
    CLS2    integer not null,
    START   integer not null,
    END     integer not null
);
insert into srcdata values ('ABC1',100,0,1);
insert into srcdata values ('ABC1',100,1,1);
insert into srcdata values ('ABC1',100,1,3);
insert into srcdata values ('ABC1',100,1,3);
insert into srcdata values ('ABC1',100,3,4);
insert into srcdata values ('ABC1',100,4,3);
insert into srcdata values ('ABC1',100,5,6);
insert into srcdata values ('ABC1',100,6,20);
insert into srcdata values ('ABC1',100,6,20);
insert into srcdata values ('ABC1',500,4,19);
insert into srcdata values ('ABC1',500,19,4);
insert into srcdata values ('ABC2',300,4,4);


sqlite> select * from srcdata;
CLS1        CLS2        START       END
----------  ----------  ----------  ----------
ABC1        100         0           1
ABC1        100         1           1
ABC1        100         1           3
ABC1        100         1           3
ABC1        100         3           4
ABC1        100         4           3
ABC1        100         5           6
ABC1        100         6           20
ABC1        100         6           20
ABC1        500         4           19
ABC1        500         19          4
ABC2        300         4           4

with src (cls1, cls2, start, end)
  as (
      select distinct cls1, cls2, min(start, end), max(start, end)
        from srcdata
     ),
     c (cls1, cls2, start, end)
  as (
         select cls1, cls2, start, end
           from src
      union
         select c.cls1, c.cls2, s.start, c.end
           from src as s, c
          where c.cls1 == s.cls1
            and c.cls2 == s.cls2
            and c.start == s.end
            and s.start != c.start
     ),
     u (cls1, cls2, start, end)
  as (
      select cls1, cls2, start, end
        from c as o
       where not exists (
                         select *
                           from c
                          where cls1 == o.cls1
                            and cls2 == o.cls2
                            and (   (
                                         o.start between start + 1 and end
                                     and o.start between start and end - 1
                                    )
                                 or (
                                         o.end between start + 1 and end
                                     and o.end between start and end - 1
                                    )
                                )
                        )
     )
select *
  from u
order by cls1, cls2, start, end
;

cls1        cls2        start       end
----------  ----------  ----------  ----------
ABC1        100         0           4
ABC1        100         5           20
ABC1        500         4           19
ABC2        300         4           4

So in the CTE 
 "src" makes sure that start < end and removes duplicate rows
 "c" finds and adds coalesced range rows
 "u" finds the rows in c where there is no containing row

If you have any significant amount of data the following procedure is probably 
many many times faster since the various bits of the single CTE version are not 
materialized (though if you can figure out how to force that, it will do the 
same thing):

drop table if exists temp.src;
drop table if exists temp.coal;

create temporary table src
(
    cls1    text not null,
    cls2    text not null,
    start   integer not null,
    end     integer not null
);
insert into temp.src
    select distinct cls1, cls2, min(start, end) as start, max(start, end) as end
      from srcdata
;
create index temp.idx_src on src (cls1, cls2, end);

create temporary table coal
(
    cls1    text not null,
    cls2    text not null,
    start   integer not null,
    end     integer not null
);
insert into temp.coal
    with c
      as (
             select cls1, cls2, start, end
               from temp.src
          union
             select c.cls1, c.cls2, s.start, c.end
               from temp.src as s, c
              where c.cls1 == s.cls1
                and c.cls2 == s.cls2
                and c.start == s.end
                and s.start != c.start
         )
    select cls1, cls2, start, end
      from c
;
create index temp.idx_coal on coal (cls1, cls2);

  select cls1, cls2, start, end
    from temp.coal as o
   where not exists (
                     select 1
                       from temp.coal as c
                      where cls1 == o.cls1
                        and cls2 == o.cls2
                        and (   (
                                     o.start between start + 1 and end
                                 and o.start between start and end - 1
                                )
                             or (
                                     o.end between start + 1 and end
                                 and o.end between start and end - 1
                                )
                            )
                    )
order by cls1, cls2, start, end
;

drop table if exists temp.src;
drop table if exists temp.coal;

-- 
The fact that there's a Highway to Hell but only a Stairway to Heaven says a 
lot about anticipated traffic volume.

>-----Original Message-----
>From: sqlite-users <sqlite-users-boun...@mailinglists.sqlite.org> On
>Behalf Of Golding Robert
>Sent: Wednesday, 29 January, 2020 08:02
>To: sqlite-users@mailinglists.sqlite.org
>Subject: [sqlite] single table data collapse with constraints
>
>Hi,
>
>I am emailing to ask if someone could advise me how to create a query or
>queries which will collapse some data based on a limited number of
>constraints.
>I am currently attempting to complete this task using DB Browser for
>SQLite. I  have tried to write a WITH RECLUSIVE statement as I think this
>is the requirement but am struggling with both the abstraction and
>syntax.
>
>
>I have data contained with a single table of structure:
>CLS1 field (text)
>CLS2 field (integer)
>START field (integer)
>END field (integer
>
>I need to collapse the data based on the matching of values in fields
>CLS1 and CLS2; the final constraint is that if END and START values are
>continuous of the another record then they can be collapsed. Therefore
>records should only be collapsed if gaps do not exist. The results then
>need to written to a new table, leaving the original data as is.
>
>Input data: assumptions
>
>  1.  Data may or may not be ordered
>  2.  Duplicates may or may not exist
>  3.  Start and end values could be the same
>  4.  Start values are normally lower that the end value, however the
>high value could be in the start field
>  5.  Assume that there is no overlap in terms of start and end values
>(namely if they can be joined then one will stop where the next starts)
>
>Input data:  example
>CLS1,CLS2,START,END
>ABC1,100,0,1
>ABC1,100,1,1 (start and end values could be the same, in the first
>instance assume that they may be dissolved if possible, if they cannot
>the record need to be retained)
>ABC1,100,1,3
>ABC1,100,1,3 (duplicates may or may not be present, if present then they
>can be dissolved into a single instance)
>ABC1,100,3,4
>ABC1,100,4,3
>ABC1,100,5,6
>ABC1,100,6,20
>ABC1,100,6,20(duplicates may or may not be present, if present then they
>can be dissolved into a single instance)
>ABC1,500,4,19
>ABC1,500,19,4 (start and end values could be inverted where Start is high
>and End id low (this is not the norm but it is legitimate) in this case
>start and end values may be inverted, and in this case dissolved into a
>single instance)
>ABC2,300,4,4 (start and end values could be the same, in the first
>instance assume that they may be dissolved if possible, if they cannot
>the record need to be retained)
>
>
>
>Output data: collapsed/merged expected output
>CLS1,CLS2,START,END
>ABC1,100,0,4
>ABC1,100,5,20
>ABC1,500,4,19
>ABC2,300,4,4
>
>
>I would be extremely grateful if anybody could help me with this issue.
>
>
>Regards,
>
>Rob
>
>
>
>*************************************************************************
>*************************************************************************
>**************
>
>The content of this email (and any attachment) is confidential. It may
>also be legally privileged or otherwise protected from disclosure.
>This email should not be used by anyone who is not an original intended
>recipient, nor may it be copied or disclosed to anyone who is not an
>original intended recipient.
>
>If you have received this email by mistake please notify us by emailing
>the sender, and then delete the email and any copies from your system.
>
>Liability cannot be accepted for statements made which are clearly the
>sender's own and not made on behalf of Network Rail.
>Network Rail Infrastructure Limited registered in England and Wales No.
>2904587, registered office Network Rail, 2nd Floor, One Eversholt Street,
>London, NW1 2DN
>
>*************************************************************************
>*************************************************************************
>**************
>_______________________________________________
>sqlite-users mailing list
>sqlite-users@mailinglists.sqlite.org
>http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users



_______________________________________________
sqlite-users mailing list
sqlite-users@mailinglists.sqlite.org
http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users

Re: [sqlite] single table data collapse with constraints

Reply via email to