emacs-config/etc/org-sql/viz_setup.sql

create or replace function to_localtime(ts timestamp with time zone)
returns time with time zone
as $$
  select ts::time at time zone 'US/Eastern';
$$ language SQL;

create or replace function get_bin_num(x numeric, width int)
returns int
as $$
  -- select (floor(x/width)*width)::int;
  select floor(x/width)::int;
$$ language SQL;

CREATE OR REPLACE FUNCTION bin_ranges(tname text, cname text, width int)
RETURNS TABLE(bin_number int, bin_interval text)
AS $func$
BEGIN
RETURN QUERY EXECUTE format('
  with
    bin_numbers as (
      select
      generate_series(
        min(get_bin_num(%1$s,%2$s)),
        max(get_bin_num(%1$s,%2$s))
      ) as bin_number
      from %3$s
    )

  select
    bin_number,
    concat(bin_number*%2$s, ''-'', bin_number*%2$s + %2$s - 1) as bin_interval
    from bin_numbers
    order by bin_number;',
  cname,
  width,
  tname
  );
END
$func$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION histogram(tname text, cname text, width int)
RETURNS TABLE(bin_number int, bin_interval text, count bigint)
AS $func$
BEGIN
RETURN QUERY EXECUTE format('
  with binned as (
      select
        get_bin_num(%1$s,%2$s) as ab_floor,
        count(*) as count
      from %3$s a
      group by ab_floor
      order by ab_floor
  )

  select
    bin_number,
    bin_interval,
    case when b.count is null then 0 else b.count end as count
    from bin_ranges(''%3$s'',''%1$s'',%2$s) r
    left join binned b on r.bin_number=b.ab_floor
    order by r.bin_number;',
  cname,
  width,
  tname
  );
END
$func$ LANGUAGE plpgsql;

create or replace procedure make_vis_tables()
language plpgsql
as $$

declare
begin

create schema if not exists viz;

create temporary table _scheduled_timestamps as
select ts.* from planning_entries pe
  join timestamps ts using (timestamp_id)
  where
    pe.planning_type = 'scheduled';

create temporary table _deadline_timestamps as
select ts.* from planning_entries pe
  join timestamps ts using (timestamp_id)
  where
    pe.planning_type = 'deadline';

create temporary table _closed_timestamps as
select ts.* from planning_entries pe
  join timestamps ts using (timestamp_id)
  where
    pe.planning_type = 'closed';

create temporary table _tags as
select hc.headline_id, t.tag from headline_tags t
  join headline_closures hc on hc.parent_id = t.headline_id
  union
  select h.headline_id, t.tag from file_tags t
  join headlines h on h.outline_hash = t.outline_hash;

create temporary table _category_tags as
select distinct * from _tags t where t.tag like '\__%';

-- TODO there is a small chance that headlines might have two context tags
-- if different contexts are explicitly specified on two headlines at different
-- levels
create temporary table _context_tags as
select distinct * from _tags t where t.tag like '@_%';

create temporary table _resource_tags as
select distinct t.headline_id, array_agg(t.tag) as tags
  from _tags t
  where
    t.tag like '#_%'
  group by t.headline_id;

create temporary table _incubated_headlines as
select distinct * from _tags t
  where
    t.tag = '%inc';

create temporary table _other_tags as
select distinct t.headline_id, array_agg(t.tag) as tags
  from _tags t
  -- TODO this is not robust code; change will require edits in two places :(
  where
    not t.tag like '#_%'
    and not t.tag like '@_%'
    and not t.tag like '\__%'
    and t.tag <> '%inc'
  group by t.headline_id;

create temporary table _created_timestamps as
select
  hp.headline_id,
  to_timestamp(p.val_text, '[YYYY-MM-DD Dy HH24:MI]') as created_timestamp
  from headline_properties hp
  join properties p using (property_id)
  where
    p.key_text = 'CREATED'
    and not p.val_text is NULL;

-- temp table to hold all headlines with scheduled repeaters
create temporary table _repeaters as
select * from headlines h
  join _scheduled_timestamps ts using (headline_id)
  join timestamp_repeaters tr using (timestamp_id);

create temporary table _todo_closures as
select
  parents.headline_id as parent_id,
  children.headline_id as child_id,
  hc.depth as depth
  from headline_closures hc
  join headlines parents on parents.headline_id = hc.parent_id
  join headlines children on children.headline_id = hc.headline_id
  where
    -- this ensures we only get headlines that are children of other headlines
    hc.depth > 0
    -- the parents must have a keyword
    and not parents.keyword is NULL
    -- the children must have a keyword
    and not children.keyword is NULL;

create temporary table _iterator_headlines as
select distinct
  children.headline_id,
  children.headline_text,
  children.keyword
  from headline_closures hc
  join headlines parents on parents.headline_id = hc.parent_id
  join headlines children on children.headline_id = hc.headline_id
  join headline_properties parent_hp on parents.headline_id = parent_hp.headline_id
  join properties parent_props on parent_hp.property_id = parent_props.property_id
  where
    hc.depth > 0
    and parent_props.key_text = 'PARENT_TYPE'
    and parent_props.val_text = 'iterator';

create temporary table _clock_sums as
select
  c.headline_id,
  to_timestamp(c.time_start) as clock_start,
  to_timestamp(c.time_end) as clock_end,
  sum(c.time_end - c.time_start) / 60.0 as clock_sum
  from clocks c
  group by c.headline_id, c.time_start, c.time_end;

-- clock sums partitioned by DONE -> TODO state changes (this mostly useful for
-- repeaters although all headlines are included in this calculation)
create temporary table _partitioned_clock_sums as
-- with
--   -- this table will have all the clock times with the done->todo state changes
--   -- inserted in between (sorted by clock start time/state change time)
--   tmp as (
-- 	select
--       c.file_path,
--       c.headline_offset,
--       c.time_start,
--       c.time_end,
--       NULL as state_change_offset
--       from clocks c
--     union
--     select
--       le.file_path,
--       le.headline_offset,
--       le.time_logged as time_start,
--       NULL as time_end,
--       le.entry_offset as state_change_offset
--       from logbook_entries le
--       join state_changes sc using (file_path, entry_offset)
--       where
--         sc.state_old = 'TODO'
--         and (sc.state_new = 'DONE' or sc.state_new = 'CANC')
--   ),
--   -- this table will number each "group" of timestamps, where a "group" is
--   -- defined by timestamps under the same headline (eg matching file_path and
--   -- headline_offset) that are partitioned by the todo->done state change
--   -- entries (if any, if only one or none, there will only be one group under
--   -- one headline)
--   --
--   -- NOTE 1: the WHERE clause is in the next outer query since state-change rows
--   -- themselves (which are removed by the WHERE) are necessary to define the
--   -- groups)
--   --
--   -- NOTE 2: if a headline does not have any state changes, it may get the same
--   -- group index as the last group of the previous headline. This shouldn't
--   -- matter, since the GROUP BY in the outer query also takes the file_path and
--   -- headline_offset into account
--   grouped as (
-- 	select
--       t.file_path,
-- 	  t.headline_offset,
-- 	  t.time_start,
-- 	  t.time_end,
-- 	  t.state_change_offset,
-- 	  sum(case when t.state_change_offset is not null then 1 end)
-- 	  over (order by t.file_path,
--               t.headline_offset,
--               t.time_start desc,
--               t.state_change_offset desc)
-- 	  as grp
--       from tmp t
--   ),
--   offsets as (
--     select
--       g.file_path,
--       g.headline_offset,
--       g.grp,
--       g.state_change_offset
--       from grouped g
--     where
--       not g.state_change_offset is NULL
--   )
--   select
--     g.file_path,
--     g.headline_offset,
--     min(g.time_start) as partitioned_time_start,
--     max(g.time_end) as partitioned_time_end,
--     sum(g.time_end - g.time_start) / 60.0 as partitioned_clock_sum,
--     o.state_change_offset
--     from grouped g
--   right join offsets o using (file_path, headline_offset, grp)
--   where
--     g.state_change_offset is NULL
--   group by g.file_path, g.headline_offset, g.grp, o.state_change_offset;

with
  -- this table will have all the clock times with the done->todo state changes
  -- inserted in between (sorted by clock start time/state change time)
  tmp as (
	select
      c.headline_id,
      c.time_start,
      c.time_end,
      NULL as state_change_id
      from clocks c
    union
    select
      le.headline_id,
      le.time_logged as time_start,
      NULL as time_end,
      le.entry_id as state_change_id
      from logbook_entries le
      join state_changes sc using (entry_id)
      where
        sc.state_old = 'TODO'
        and (sc.state_new = 'DONE' or sc.state_new = 'CANC')
  ),
  -- this table will number each "group" of timestamps, where a "group" is
  -- defined by timestamps under the same headline (eg matching file_path and
  -- headline_offset) that are partitioned by the todo->done state change
  -- entries (if any, if only one or none, there will only be one group under
  -- one headline)
  --
  -- NOTE 1: the WHERE clause is in the next outer query since state-change rows
  -- themselves (which are removed by the WHERE) are necessary to define the
  -- groups)
  --
  -- NOTE 2: if a headline does not have any state changes, it may get the same
  -- group index as the last group of the previous headline. This shouldn't
  -- matter, since the GROUP BY in the outer query also takes the file_path and
  -- headline_offset into account
  grouped as (
	select
	  t.headline_id,
	  t.time_start,
	  t.time_end,
	  t.state_change_id,
	  sum(case when t.state_change_id is not null then 1 end)
	  over (order by t.headline_id, t.time_start desc, t.state_change_id desc)
	  as grp
      from tmp t
  ),
  ids as (
    select g.headline_id, g.grp, g.state_change_id from grouped g
    where
      not g.state_change_id is NULL
  ),
  sums as (
	select
      g.headline_id,
	  g.grp,
      to_timestamp(min(g.time_start)) as partitioned_clock_start,
      to_timestamp(max(g.time_end)) as partitioned_clock_end,
      sum(g.time_end - g.time_start) / 60.0 as partitioned_clock_sum
      from grouped g
    where
      g.state_change_id is NULL
    group by g.headline_id, g.grp
  )
  select
    o.headline_id,
    o.state_change_id,
    s.partitioned_clock_start,
    s.partitioned_clock_end,
    s.partitioned_clock_sum
    from ids o
  left join sums s using (headline_id, grp);

create temporary table _habit_headlines as
select
  r.*,
  cs.partitioned_clock_start,
  cs.partitioned_clock_end,
  cs.partitioned_clock_sum,
  s.state_old,
  s.state_new,
  to_timestamp(le.time_logged) as closed_timestamp
  from _repeaters r
  join headline_properties hp using (headline_id)
  join properties p using (property_id)
  join _partitioned_clock_sums cs using (headline_id)
  left join state_changes s
  on s.entry_id = cs.state_change_id
  left join logbook_entries le
  on le.headline_id = r.headline_id and le.entry_id = s.entry_id
  where
    p.key_text = 'STYLE'
    and p.val_text = 'habit';

create temporary table _repeater_headlines as
select
  r.*,
  cs.partitioned_clock_start,
  cs.partitioned_clock_end,
  cs.partitioned_clock_sum,
  s.state_old,
  s.state_new,
  to_timestamp(le.time_logged) as closed_timestamp
  from _repeaters r
  join _partitioned_clock_sums cs using (headline_id)
  left join state_changes s
  on s.entry_id = cs.state_change_id
  left join logbook_entries le
  on le.headline_id = r.headline_id and le.entry_id = s.entry_id
  where
    not exists (select * from _habit_headlines habits
      where r.headline_id = habits.headline_id);

create temporary table _project_task_headlines as
select distinct h.* from _todo_closures tc
  join headlines h on tc.child_id = h.headline_id
  where
    not exists (select * from _iterator_headlines i
                 where i.headline_id = h.headline_id);

-- drop table if exists viz.project_toplevel_headlines;
-- create table viz.project_toplevel_headlines as
-- select distinct
--   h.file_path,
--   h.headline_offset,
--   h.keyword,
--   h.headline_text
--   from _todo_closures tc0
--   join headlines h
--   on tc0.file_path = h.file_path and tc0.parent_offset = h.headline_offset
--   where
--     tc0.depth = 1
--     and not exists (select * from _todo_closures tc1
--                    where tc1.child_offset = tc0.parent_offset);

create temporary table _task_parent_mappings as
with
  maxdepth as (
	select t.child_id, max(t.depth) as depth
    from _todo_closures t
    group by t.child_id
  )
select tc.parent_id, tc.child_id from maxdepth m
  join _todo_closures tc using (child_id, depth);

-- -- TODO this will be more useful if I can also link it easily with the
-- -- toplevel headline
-- drop table if exists viz.project_parent_headlines;
-- create table viz.project_parent_headlines as
-- select distinct
--   h.file_path,
--   h.headline_offset,
--   h.keyword,
--   h.headline_text
--   from _todo_closures tc
--   join headlines h
--   on tc.file_path = h.file_path and tc.parent_offset = h.headline_offset
--   where
--     not exists
--       (select * from _iterator_headlines i
--         where
--          i.file_path = h.file_path
--          and i.headline_offset = h.headline_offset)
--     and not exists
--       (select * from viz.project_toplevel_headlines t
--         where
--           t.file_path = h.file_path
--           and t.headline_offset = h.headline_offset);


create temporary table _atomic_tasks as
select * from headlines h
  where
    not h.keyword is NULL
    and not exists (select * from _project_task_headlines pt
      where pt.headline_id = h.headline_id)
    -- and not exists (select * from viz.project_parent_headlines pp
    --   where pp.headline_id = h.headline_id)
    -- and not exists (select * from viz.project_toplevel_headlines pl
    --   where pl.headline_id = h.headline_id)
    and not exists (select * from _task_parent_mappings m
      where m.parent_id = h.headline_id)
    and not exists (select * from _repeaters r
      where r.headline_id = h.headline_id)
    and not exists (select * from _iterator_headlines i
      where i.headline_id = h.headline_id);

create temporary table _iterator_tasks as
select * from headlines h
  where
    not h.keyword is NULL
    and exists (select * from _iterator_headlines i
      where i.headline_id = h.headline_id)
    and not exists (select * from _task_parent_mappings m
      where m.parent_id = h.headline_id);

-- drop table if exists viz.atomic_tasks;
-- create table viz.atomic_tasks as
-- select
--   a.*,
--   cs.clock_sum,
--   to_timestamp(s.time_start) as scheduled_time,
--   to_timestamp(d.time_start) as deadline_time,
--   to_timestamp(c.time_start) as closed_time,
--   ct.tag as category,
--   xt.tag as context,
--   rt.tags as resources,
--   t.tags as tags,
--   cr.created_timestamp,
--   (ih.tag is not NULL) as incubated
--   from _atomic_tasks a
--   left join _clock_sums cs using (headline_offset, file_path)
--   left join _scheduled_timestamps s using (file_path, headline_offset)
--   left join _deadline_timestamps d using (file_path, headline_offset)
--   left join _closed_timestamps c using (file_path, headline_offset)
--   left join _category_tags ct using (file_path, headline_offset)
--   left join _context_tags xt using (file_path, headline_offset)
--   left join _resource_tags rt using (file_path, headline_offset)
--   left join _other_tags t using (file_path, headline_offset)
--   left join _created_timestamps cr using (file_path, headline_offset)
--   left join _incubated_headlines ih using (file_path, headline_offset);

-- TODO this doesn't have iterators (yet)
drop table if exists viz.all_tasks;
create table viz.all_tasks as
with
  all_tasks as (
    select
      r.headline_id,
      r.state_new as keyword,
      r.partitioned_clock_start as clock_start,
      r.partitioned_clock_end as clock_end,
      r.partitioned_clock_sum as clock_sum,
      r.closed_timestamp,
      'repeater' as task_type
      from _repeater_headlines r
    union all
    select
      h.headline_id,
      h.state_new as keyword,
      h.partitioned_clock_start as clock_start,
      h.partitioned_clock_end as clock_end,
      h.partitioned_clock_sum as clock_sum,
      h.closed_timestamp,
      'habit' as task_type
      from _habit_headlines h
    union all
    -- TODO this is redundant to have 'tasks' made twice from different sources
    select
      a.headline_id,
      a.keyword,
      cs.clock_start,
      cs.clock_end,
      cs.clock_sum,
      to_timestamp(c.time_start) as closed_timestamp,
      'atomic' as task_type
      from _atomic_tasks a
      left join _clock_sums cs using (headline_id)
      left join _closed_timestamps c using (headline_id)
    union all
    select
      p.headline_id,
      p.keyword,
      cs.clock_start,
      cs.clock_end,
      cs.clock_sum,
      to_timestamp(c.time_start) as closed_timestamp,
      'project' as task_type
      from _project_task_headlines p
      left join _clock_sums cs using (headline_id)
      left join _closed_timestamps c using (headline_id)
    union all
    select
      i.headline_id,
      i.keyword,
      cs.clock_start,
      cs.clock_end,
      cs.clock_sum,
      to_timestamp(c.time_start) as closed_timestamp,
      'iterator' as task_type
      from _iterator_tasks i
      left join _clock_sums cs using (headline_id)
      left join _closed_timestamps c using (headline_id)
  )
select
  f.file_path,
  a.*,
  tm.parent_id as project_parent_id,
  to_timestamp(s.time_start) as scheduled_timestamp,
  to_timestamp(d.time_start) as deadline_timestamp,
  h.headline_text,
  h.effort,
  h.priority,
  h.is_archived,
  h.is_commented,
  h.content,
  ct.tag as category,
  xt.tag as context,
  rt.tags as resources,
  t.tags as tags,
  cr.created_timestamp,
  (ih.tag is not NULL) as incubated
  from all_tasks a
  join headlines h using (headline_id)
  join file_metadata f using (outline_hash)
  left join _scheduled_timestamps s using (headline_id)
  left join _deadline_timestamps d using (headline_id)
  left join _category_tags ct using (headline_id)
  left join _context_tags xt using (headline_id)
  left join _resource_tags rt using (headline_id)
  left join _other_tags t using (headline_id)
  left join _created_timestamps cr using (headline_id)
  left join _incubated_headlines ih using (headline_id)
  left join _task_parent_mappings tm on tm.child_id = h.headline_id
  order by a.headline_id, a.closed_timestamp desc;

drop table if exists viz.sleep_length;
create table viz.sleep_length as
with
  tmp as (
    select distinct
    to_localtime(clock_start) as time_start_clock,
    to_localtime(scheduled_timestamp) as time_sched,
    clock_sum,
    clock_start,
    clock_end
    from viz.all_tasks
    where
      headline_text = 'sleep')
  select distinct
    clock_start as sleep_timestamp,
    time_sched as sleep_sched,
    clock_sum / 60.0 as sleep_hours,
    time_start_clock as sleep_start_clock,
    -- day of week that sleep starts; subtract 12 hours off timestamp to count
    -- bedtime after midnight as starting on the previous day
    extract(dow from (clock_start - (12||' hours')::interval) at time zone 'US/Eastern')
      as sleep_start_day,
    mod((extract(hour from time_start_clock) * 60
         - extract(hour from time_sched) * 60
		 + extract(minute from time_start_clock)
		 - extract(minute from time_sched)
         + 720 * 3)::bigint,
		1440) / 1440.0 * 24 - 12 as sleep_start_offset from tmp;

drop table if exists viz.check_email_2;
create table viz.check_email_2 as
-- ASSUME that the repeater has the correct time of day, but may be offset
-- by a multiple of 7 days
select
  clock_start,
  extract(epoch from clock_start::time - scheduled_timestamp::time)/60 as offset_minutes
  from viz.all_tasks
where
  headline_text = 'check email 2'
order by clock_start desc;

end

$$;