From 97c2f08a4a8b30ccaaf7544e439b229fbdcc1245 Mon Sep 17 00:00:00 2001 From: ndwarshuis Date: Sun, 14 Feb 2021 22:20:27 -0500 Subject: [PATCH] ADD sql visualization script --- .gitignore | 1 + etc/conf.org | 8 +- etc/org-sql/viz_setup.sql | 528 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 534 insertions(+), 3 deletions(-) create mode 100644 etc/org-sql/viz_setup.sql diff --git a/.gitignore b/.gitignore index 530206d..016047c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,6 @@ straight/versions/* !etc etc/* +!etc/org-sql !etc/conf.org !etc/dashlogo.png diff --git a/etc/conf.org b/etc/conf.org index 43f697f..5322014 100644 --- a/etc/conf.org +++ b/etc/conf.org @@ -3330,9 +3330,11 @@ These are variables that I set for my use but will not go into the eventual pack :hostname "portnoy4prez.yavin4.ch" :password "org_sql" :username "org_sql") - - org-sql-debug nil - + ;; some SQL code to denormalize my org-file data for visualization + org-sql-post-init-hooks '((file+ (f-join no-littering-etc-directory + "org_sql" "viz_setup.sql"))) + org-sql-post-push-hooks '((sql+ "CALL make_vis_tables();")) + org-sql-debug t org-sql-files '("~/Org/.archive/" "~/Org/general.org_archive" "~/Org/general.org" diff --git a/etc/org-sql/viz_setup.sql b/etc/org-sql/viz_setup.sql new file mode 100644 index 0000000..6e19ec2 --- /dev/null +++ b/etc/org-sql/viz_setup.sql @@ -0,0 +1,528 @@ +create or replace procedure make_vis_tables() +language plpgsql +as $$ + +declare +begin + +create schema if not exists viz; + +create temporary table _scheduled_timestamps as +select ts.* from planning_entries pe + join timestamps ts using (timestamp_id) + where + pe.planning_type = 'scheduled'; + +create temporary table _deadline_timestamps as +select ts.* from planning_entries pe + join timestamps ts using (timestamp_id) + where + pe.planning_type = 'deadline'; + +create temporary table _closed_timestamps as +select ts.* from planning_entries pe + join timestamps ts using (timestamp_id) + where + pe.planning_type = 'closed'; + +create temporary table _tags as +select hc.headline_id, t.tag from headline_tags t + join headline_closures hc on hc.parent_id = t.headline_id + union + select h.headline_id, t.tag from file_tags t + join headlines h on h.outline_hash = t.outline_hash; + +create temporary table _category_tags as +select distinct * from _tags t where t.tag like '\__%'; + +-- TODO there is a small chance that headlines might have two context tags +-- if different contexts are explicitly specified on two headlines at different +-- levels +create temporary table _context_tags as +select distinct * from _tags t where t.tag like '@_%'; + +create temporary table _resource_tags as +select distinct t.headline_id, array_agg(t.tag) as tags + from _tags t + where + t.tag like '#_%' + group by t.headline_id; + +create temporary table _incubated_headlines as +select distinct * from _tags t + where + t.tag = '%inc'; + +create temporary table _other_tags as +select distinct t.headline_id, array_agg(t.tag) as tags + from _tags t + -- TODO this is not robust code; change will require edits in two places :( + where + not t.tag like '#_%' + and not t.tag like '@_%' + and not t.tag like '\__%' + and t.tag <> '%inc' + group by t.headline_id; + +create temporary table _created_timestamps as +select + hp.headline_id, + to_timestamp(p.val_text, '[YYYY-MM-DD Dy HH24:MI]') as created_timestamp + from headline_properties hp + join properties p using (property_id) + where + p.key_text = 'CREATED' + and not p.val_text is NULL; + +-- temp table to hold all headlines with scheduled repeaters +create temporary table _repeaters as +select * from headlines h + join _scheduled_timestamps ts using (headline_id) + join timestamp_repeaters tr using (timestamp_id); + +create temporary table _todo_closures as +select + parents.headline_id as parent_id, + children.headline_id as child_id, + hc.depth as depth + from headline_closures hc + join headlines parents on parents.headline_id = hc.parent_id + join headlines children on children.headline_id = hc.headline_id + where + -- this ensures we only get headlines that are children of other headlines + hc.depth > 0 + -- the parents must have a keyword + and not parents.keyword is NULL + -- the children must have a keyword + and not children.keyword is NULL; + +create temporary table _iterator_headlines as +select distinct + children.headline_id, + children.headline_text, + children.keyword + from headline_closures hc + join headlines parents on parents.headline_id = hc.parent_id + join headlines children on children.headline_id = hc.headline_id + join headline_properties parent_hp on parents.headline_id = parent_hp.headline_id + join properties parent_props on parent_hp.property_id = parent_props.property_id + where + hc.depth > 0 + and parent_props.key_text = 'PARENT_TYPE' + and parent_props.val_text = 'iterator'; + +create temporary table _clock_sums as +select + c.headline_id, + sum(c.time_end - c.time_start) / 60.0 as clock_sum + from clocks c + group by c.headline_id; + +-- clock sums partitioned by DONE -> TODO state changes (this mostly useful for +-- repeaters although all headlines are included in this calculation) +create temporary table _partitioned_clock_sums as +-- with +-- -- this table will have all the clock times with the done->todo state changes +-- -- inserted in between (sorted by clock start time/state change time) +-- tmp as ( +-- select +-- c.file_path, +-- c.headline_offset, +-- c.time_start, +-- c.time_end, +-- NULL as state_change_offset +-- from clocks c +-- union +-- select +-- le.file_path, +-- le.headline_offset, +-- le.time_logged as time_start, +-- NULL as time_end, +-- le.entry_offset as state_change_offset +-- from logbook_entries le +-- join state_changes sc using (file_path, entry_offset) +-- where +-- sc.state_old = 'TODO' +-- and (sc.state_new = 'DONE' or sc.state_new = 'CANC') +-- ), +-- -- this table will number each "group" of timestamps, where a "group" is +-- -- defined by timestamps under the same headline (eg matching file_path and +-- -- headline_offset) that are partitioned by the todo->done state change +-- -- entries (if any, if only one or none, there will only be one group under +-- -- one headline) +-- -- +-- -- NOTE 1: the WHERE clause is in the next outer query since state-change rows +-- -- themselves (which are removed by the WHERE) are necessary to define the +-- -- groups) +-- -- +-- -- NOTE 2: if a headline does not have any state changes, it may get the same +-- -- group index as the last group of the previous headline. This shouldn't +-- -- matter, since the GROUP BY in the outer query also takes the file_path and +-- -- headline_offset into account +-- grouped as ( +-- select +-- t.file_path, +-- t.headline_offset, +-- t.time_start, +-- t.time_end, +-- t.state_change_offset, +-- sum(case when t.state_change_offset is not null then 1 end) +-- over (order by t.file_path, +-- t.headline_offset, +-- t.time_start desc, +-- t.state_change_offset desc) +-- as grp +-- from tmp t +-- ), +-- offsets as ( +-- select +-- g.file_path, +-- g.headline_offset, +-- g.grp, +-- g.state_change_offset +-- from grouped g +-- where +-- not g.state_change_offset is NULL +-- ) +-- select +-- g.file_path, +-- g.headline_offset, +-- min(g.time_start) as partitioned_time_start, +-- max(g.time_end) as partitioned_time_end, +-- sum(g.time_end - g.time_start) / 60.0 as partitioned_clock_sum, +-- o.state_change_offset +-- from grouped g +-- right join offsets o using (file_path, headline_offset, grp) +-- where +-- g.state_change_offset is NULL +-- group by g.file_path, g.headline_offset, g.grp, o.state_change_offset; + +with + -- this table will have all the clock times with the done->todo state changes + -- inserted in between (sorted by clock start time/state change time) + tmp as ( + select + c.headline_id, + c.time_start, + c.time_end, + NULL as state_change_id + from clocks c + union + select + le.headline_id, + le.time_logged as time_start, + NULL as time_end, + le.entry_id as state_change_id + from logbook_entries le + join state_changes sc using (entry_id) + where + sc.state_old = 'TODO' + and (sc.state_new = 'DONE' or sc.state_new = 'CANC') + ), + -- this table will number each "group" of timestamps, where a "group" is + -- defined by timestamps under the same headline (eg matching file_path and + -- headline_offset) that are partitioned by the todo->done state change + -- entries (if any, if only one or none, there will only be one group under + -- one headline) + -- + -- NOTE 1: the WHERE clause is in the next outer query since state-change rows + -- themselves (which are removed by the WHERE) are necessary to define the + -- groups) + -- + -- NOTE 2: if a headline does not have any state changes, it may get the same + -- group index as the last group of the previous headline. This shouldn't + -- matter, since the GROUP BY in the outer query also takes the file_path and + -- headline_offset into account + grouped as ( + select + t.headline_id, + t.time_start, + t.time_end, + t.state_change_id, + sum(case when t.state_change_id is not null then 1 end) + over (order by t.headline_id, t.time_start desc, t.state_change_id desc) + as grp + from tmp t + ), + ids as ( + select g.headline_id, g.grp, g.state_change_id from grouped g + where + not g.state_change_id is NULL + ), + sums as ( + select + g.headline_id, + g.grp, + min(g.time_start) as partitioned_time_start, + max(g.time_end) as partitioned_time_end, + sum(g.time_end - g.time_start) / 60.0 as partitioned_clock_sum + from grouped g + where + g.state_change_id is NULL + group by g.headline_id, g.grp + ) + select + o.headline_id, + o.state_change_id, + s.partitioned_time_start, + s.partitioned_time_end, + s.partitioned_clock_sum + from ids o + left join sums s using (headline_id, grp); + +create temporary table _habit_headlines as +select + r.*, + cs.partitioned_clock_sum, + s.state_old, + s.state_new, + to_timestamp(le.time_logged) as closed_timestamp + from _repeaters r + join headline_properties hp using (headline_id) + join properties p using (property_id) + join _partitioned_clock_sums cs using (headline_id) + left join state_changes s + on s.entry_id = cs.state_change_id + left join logbook_entries le + on le.headline_id = r.headline_id and le.entry_id = s.entry_id + where + p.key_text = 'STYLE' + and p.val_text = 'habit'; + +create temporary table _repeater_headlines as +select + r.*, + cs.partitioned_clock_sum, + s.state_old, + s.state_new, + to_timestamp(le.time_logged) as closed_timestamp + from _repeaters r + join _partitioned_clock_sums cs using (headline_id) + left join state_changes s + on s.entry_id = cs.state_change_id + left join logbook_entries le + on le.headline_id = r.headline_id and le.entry_id = s.entry_id + where + not exists (select * from _habit_headlines habits + where r.headline_id = habits.headline_id); + +create temporary table _project_task_headlines as +select distinct h.* from _todo_closures tc + join headlines h on tc.child_id = h.headline_id + where + not exists (select * from _iterator_headlines i + where i.headline_id = h.headline_id); + +-- drop table if exists viz.project_toplevel_headlines; +-- create table viz.project_toplevel_headlines as +-- select distinct +-- h.file_path, +-- h.headline_offset, +-- h.keyword, +-- h.headline_text +-- from _todo_closures tc0 +-- join headlines h +-- on tc0.file_path = h.file_path and tc0.parent_offset = h.headline_offset +-- where +-- tc0.depth = 1 +-- and not exists (select * from _todo_closures tc1 +-- where tc1.child_offset = tc0.parent_offset); + +create temporary table _task_parent_mappings as +with + maxdepth as ( + select t.child_id, max(t.depth) as depth + from _todo_closures t + group by t.child_id + ) +select tc.parent_id, tc.child_id from maxdepth m + join _todo_closures tc using (child_id, depth); + +-- -- TODO this will be more useful if I can also link it easily with the +-- -- toplevel headline +-- drop table if exists viz.project_parent_headlines; +-- create table viz.project_parent_headlines as +-- select distinct +-- h.file_path, +-- h.headline_offset, +-- h.keyword, +-- h.headline_text +-- from _todo_closures tc +-- join headlines h +-- on tc.file_path = h.file_path and tc.parent_offset = h.headline_offset +-- where +-- not exists +-- (select * from _iterator_headlines i +-- where +-- i.file_path = h.file_path +-- and i.headline_offset = h.headline_offset) +-- and not exists +-- (select * from viz.project_toplevel_headlines t +-- where +-- t.file_path = h.file_path +-- and t.headline_offset = h.headline_offset); + +drop table if exists viz.sleep_length; +create table viz.sleep_length as +with + tmp as ( + select distinct + to_timestamp(c.time_start)::time at time zone 'US/Eastern' as time_start_clock, + c.time_start, + c.time_end + from _repeater_headlines hh + join clocks c using (headline_id) + where + hh.headline_text = 'sleep') + select distinct + to_timestamp(time_start) as sleep_timestamp, + (time_end - time_start) / 3600.0 as sleep_hours, + time_start_clock as sleep_start_clock, + -- day of week that sleep starts; subtract 12 hours off timestamp to count + -- bedtime after midnight as starting on the previous day + extract(dow from to_timestamp(time_start - 43200) at time zone 'US/Eastern') + as sleep_start_day, + -- offset from target bedtime start (assume target bedtime is 23:45) + mod((extract(hour from time_start_clock) * 60 + + extract(minute from time_start_clock) + 15 + 720)::bigint, + 1440) / 1440.0 * 24 - 12 as sleep_start_offset from tmp; + +create temporary table _atomic_tasks as +select * from headlines h + where + not h.keyword is NULL + and not exists (select * from _project_task_headlines pt + where pt.headline_id = h.headline_id) + -- and not exists (select * from viz.project_parent_headlines pp + -- where pp.headline_id = h.headline_id) + -- and not exists (select * from viz.project_toplevel_headlines pl + -- where pl.headline_id = h.headline_id) + and not exists (select * from _task_parent_mappings m + where m.parent_id = h.headline_id) + and not exists (select * from _repeaters r + where r.headline_id = h.headline_id) + and not exists (select * from _iterator_headlines i + where i.headline_id = h.headline_id); + +create temporary table _iterator_tasks as +select * from headlines h + where + not h.keyword is NULL + and exists (select * from _iterator_headlines i + where i.headline_id = h.headline_id) + and not exists (select * from _task_parent_mappings m + where m.parent_id = h.headline_id); + +-- drop table if exists viz.atomic_tasks; +-- create table viz.atomic_tasks as +-- select +-- a.*, +-- cs.clock_sum, +-- to_timestamp(s.time_start) as scheduled_time, +-- to_timestamp(d.time_start) as deadline_time, +-- to_timestamp(c.time_start) as closed_time, +-- ct.tag as category, +-- xt.tag as context, +-- rt.tags as resources, +-- t.tags as tags, +-- cr.created_timestamp, +-- (ih.tag is not NULL) as incubated +-- from _atomic_tasks a +-- left join _clock_sums cs using (headline_offset, file_path) +-- left join _scheduled_timestamps s using (file_path, headline_offset) +-- left join _deadline_timestamps d using (file_path, headline_offset) +-- left join _closed_timestamps c using (file_path, headline_offset) +-- left join _category_tags ct using (file_path, headline_offset) +-- left join _context_tags xt using (file_path, headline_offset) +-- left join _resource_tags rt using (file_path, headline_offset) +-- left join _other_tags t using (file_path, headline_offset) +-- left join _created_timestamps cr using (file_path, headline_offset) +-- left join _incubated_headlines ih using (file_path, headline_offset); + +-- TODO this doesn't have iterators (yet) +drop table if exists viz.all_tasks; +create table viz.all_tasks as +with + all_tasks as ( + select + r.headline_id, + r.state_new as keyword, + r.partitioned_clock_sum as clock_sum, + r.closed_timestamp, + 'repeater' as task_type + from _repeater_headlines r + union all + select + h.headline_id, + h.state_new as keyword, + h.partitioned_clock_sum as clock_sum, + h.closed_timestamp, + 'habit' as task_type + from _habit_headlines h + union all + -- TODO this is redundant to have 'tasks' made twice from different sources + select + a.headline_id, + a.keyword, + cs.clock_sum, + to_timestamp(c.time_start) as closed_timestamp, + 'atomic' as task_type + from _atomic_tasks a + left join _clock_sums cs using (headline_id) + left join _closed_timestamps c using (headline_id) + union all + select + p.headline_id, + p.keyword, + cs.clock_sum, + to_timestamp(c.time_start) as closed_timestamp, + 'project' as task_type + from _project_task_headlines p + left join _clock_sums cs using (headline_id) + left join _closed_timestamps c using (headline_id) + union all + select + i.headline_id, + i.keyword, + cs.clock_sum, + to_timestamp(c.time_start) as closed_timestamp, + 'iterator' as task_type + from _iterator_tasks i + left join _clock_sums cs using (headline_id) + left join _closed_timestamps c using (headline_id) + ) +select + f.file_path, + a.*, + tm.parent_id as project_parent_id, + to_timestamp(s.time_start) as scheduled_timestamp, + to_timestamp(d.time_start) as deadline_timestamp, + h.headline_text, + h.effort, + h.priority, + h.is_archived, + h.is_commented, + h.content, + ct.tag as category, + xt.tag as context, + rt.tags as resources, + t.tags as tags, + cr.created_timestamp, + (ih.tag is not NULL) as incubated + from all_tasks a + join headlines h using (headline_id) + join file_metadata f using (outline_hash) + left join _scheduled_timestamps s using (headline_id) + left join _deadline_timestamps d using (headline_id) + left join _category_tags ct using (headline_id) + left join _context_tags xt using (headline_id) + left join _resource_tags rt using (headline_id) + left join _other_tags t using (headline_id) + left join _created_timestamps cr using (headline_id) + left join _incubated_headlines ih using (headline_id) + left join _task_parent_mappings tm on tm.child_id = h.headline_id + order by a.headline_id, a.closed_timestamp desc; + +end + +$$;