$$%% examples \newcommand{\exGraph}{\graph_{\mathrm{ex}}} \newcommand{\exOnto}{\onto_{\mathrm{ex}}} \newcommand{\exMappings}{\mappings_{\mathrm{ex}}} \newcommand{\exExtensions}{\extensions_{\mathrm{ex}}} \newcommand{\exRule}{r_{\mathrm{ex}}} \newcommand{\RDFSrules}{\rules_{\mathrm{RDFS}}} %% RDF \newcommand{\triple}[3]{(#1, #2, #3)} \newcommand{\tuple}[1]{\langle #1 \rangle} \newcommand{\subject}{\mathtt{s}} \newcommand{\prop}{\mathtt{p}} \newcommand{\object}{\mathtt{o}} \newcommand{\blank}{\_{:}b} \newcommand{\blankn}[1]{\_{:}#1} \newcommand{\irin}[1]{{:}\mathrm{#1}} \newcommand{\class}{\mathtt{c}} \newcommand{\nsrdf}{\mathrm{rdf{:}}} \newcommand{\nsrdfs}{\mathrm{rdfs{:}}} \newcommand{\rdftype}{\mathrm{rdf{:}type}} \newcommand{\rdfLiteral}{\mathrm{rdf{:}Literal}} \newcommand{\rdfssubClassOf}{\mathrm{rdfs{:}subClassOf}} \newcommand{\rdfssubPropertyOf}{\mathrm{rdfs{:}subPropertyOf}} \newcommand{\rdfsdomain}{\mathrm{rdfs{:}domain}} \newcommand{\rdfsrange}{\mathrm{rdfs{:}range}} \newcommand{\rdfsClass}{\mathrm{rdfs{:}Class}} \newcommand{\rdfProperty}{\mathrm{rdf{:}Property}} \newcommand{\xsdint}{\mathrm{xsd{:}int}} %% \newcommand{\type}{\tau} \newcommand{\subclass}{\prec_{sc}} \newcommand{\subproperty}{\prec_{sp}} \newcommand{\domain}{\hookleftarrow_{d}} \newcommand{\range}{\hookrightarrow_{r}} \newcommand{\rdfentailment}{\vdash_{^\mathrm{RDF}}} \newcommand{\RDFS}[1]{\mathrm{RDFS}(#1)} \newcommand{\aka}{a.k.a.~} \newcommand{\etc}{etc} \newcommand{\wrt}{w.r.t.~} \newcommand{\st}{s.t.~} \newcommand{\ie}{i.e.,~} \newcommand{\eg}{e.g.,~} \newcommand{\graph}{G} \newcommand{\rules}{\mathcal{R}} \newcommand{\sources}{\mathcal{S}} \newcommand{\views}{\mathcal{V}} \newcommand{\extensions}{\mathcal{E}} \newcommand{\onto}{\mathcal{O}} \newcommand{\mappings}{\mathcal{M}} \newcommand{\modelsrdf}{\models_\rules} \newcommand{\bgp}{P} \newcommand{\Bl}[1]{\mathrm{Bl}(#1)} \newcommand{\Val}[1]{\mathrm{Val}(#1)} \newcommand{\Var}[1]{\mathrm{Var(#1)}} \newcommand{\ext}[1]{\mathrm{ext}(#1)} \newcommand{\cert}{\mathrm{cert}} \newcommand{\ans}{\mathrm{ans}} \newcommand{\query}{\leftarrow} \newcommand{\body}[1]{\textrm{body}(#1)} \newcommand{\head}[1]{\textrm{head}(#1)} \newcommand{\cs}{\mathrm{cs}} \newcommand{\lcs}{\mathrm{lcs}} \newcommand{\cl}{\mathrm{cl}} \newcommand{\lua}{\mathrm{lua}} \newcommand{\lur}{\mathrm{lur}} \newtheorem{lemma}{Lemma} \newtheorem{definition}{Definition} \newtheorem{problem}{Problem} \newtheorem{property}{Property} \newtheorem{corollary}{Corollary} \newtheorem{example}{Example} \newtheorem{theorem}{Theorem} \newcommand{\URIs}{\mathscr U} \newcommand{\IRIs}{\mathscr I} \newcommand{\BNodes}{\mathscr B} \newcommand{\Literals}{\mathscr L} \newcommand{\Variables}{\mathscr V} % DB \newcommand{\CQ}{\ensuremath{\mathtt{CQ}}\xspace} \newcommand{\UCQ}{\ensuremath{\mathtt{UCQ}}\xspace} \newcommand{\SQL}{\ensuremath{\mathtt{SQL}}\xspace} \newcommand{\rel}[1]{\mathsf{#1}} % Cost model \newcommand{\cans}[1]{|#1|_t} \newcommand{\cref}[1]{|#1|_r} \newcommand{\db}{\mathtt{db}} % DL \newcommand{\cn}{\ensuremath{N_{C}}\xspace} \newcommand{\rn}{\ensuremath{N_{R}}\xspace} \newcommand{\inds}{\ensuremath{N_{I}}\xspace} \newcommand{\ainds}{\ensuremath{\mathrm{Ind}}\xspace} \newcommand{\funct}{\mathit{funct} \ } \newcommand{\KB}{\mathcal{K}\xspace} \newcommand{\dlr}{DL-Lite$_{\mathcal{R}}$\xspace} % Logics \newcommand{\FOL}{\ensuremath{\mathtt{FOL}}\xspace} \newcommand{\datalog}{\ensuremath{\mathtt{Datalog}}\xspace} \newcommand{\dllite}{DL-Lite\xspace} \newcommand{\true}{\mathrm{true}} \newcommand{\false}{\mathrm{false}} \newcommand{\dis}{\mathtt{dis}} \newcommand{\vars}[1]{\ensuremath{\mathrm{vars}(#1)}} %\newcommand{\terms}[1]{\ensuremath{\mathrm{terms}(#1)}} %math \renewcommand{\phi}{\varphi} \newcommand\eqdef{\stackrel{\mathclap{\normalfont\mbox{def}}}{=}} \newcommand\restr[2]{#1_{|#2}} \newcommand{\ontoBody}[1]{\mathrm{body}_\onto(#1)} %proof of the rewriting theorem \newcommand{\rdfGraph}{\graph^{\mappings}_{\extensions}} \newcommand\systemGraph{\graph^{\mappings \cup \mappings^{\text{STD}}_\onto}_{\extensions \cup \extensions_\onto}} \newcommand\viewsGraph{\graph^{\mappings^{\rules,\onto} \cup \mappings^{\text{STD}}_\onto}_{\extensions \cup \extensions_\onto}} \newcommand{\standMappings}{\mappings^{\text{STD}}_\onto} \newcommand{\reminder}[1]{[\vadjust{\vbox to0pt{\vss\hbox to0pt{\hss{\Large $\Longrightarrow$}}}}{{\textsf{\small #1}}}]} %\newcommand{\FG}[1]{\textcolor{blue}{\reminder{FG:~#1}}} \newcommand{\extVersion}{false} \newcommand{\printIfExtVersion}[2] { \ifthenelse{\equal{\extVersion}{true}}{#1}{} \ifthenelse{\equal{\extVersion}{false}}{#2}{} } \newcommand{\bda}{\true} \newcommand{\ifBDA}[2]% {% \ifthenelse{\equal{\bda}{true}}{#1}{}% \ifthenelse{\equal{\bda}{false}}{#2}{}% } %%% Local Variables: %%% TeX-master: "paper" %%% End: $$

Checking Postgres Statistics

Checking the histogram

Consider a table tab and a column col, we get its histogram using:

SELECT histogram_bounds FROM pg_stats
WHERE tablename='tab' AND attname='col';

We can check the histogram quality by counting the real number of rows between each bounds of the histogram:

WITH histogram AS 
(SELECT unnest(histogram_bounds::text::int[]) AS bound 
FROM pg_stats
WHERE tablename='tab' AND attname='col') ,

intervals AS 
(select (ROW_NUMBER() OVER (ORDER BY histogram.bound)) - 1 AS index,
        lag(bound) over (order by bound) AS min, 
        bound AS max
 FROM histogram OFFSET 1),

real_histogram AS (
select index, min, max, count(col) AS count
FROM tab AS tab, intervals AS i 
WHERE tab.col >= i.min AND tab.col < i.max 
GROUP BY index, min, max
ORDER BY index) 

select index AS bucket, min, max , count , repeat('■',
               (   count::float
                 / max(count) over()
                 * 30
               )::int
        ) as bar
from real_histogram;

We obtain a result of this form:

index min max count bar
1 3818 379263 1009 ■■■■■■■■■■■■■■■■■■
2 379263 796818 1294 ■■■■■■■■■■■■■■■■■■■■■■■
3 796818 1244509 1021 ■■■■■■■■■■■■■■■■■■
4 1244509 1704733 1179 ■■■■■■■■■■■■■■■■■■■■■
5 1704733 2097371 1111 ■■■■■■■■■■■■■■■■■■■■
6 2097371 2371027 916 ■■■■■■■■■■■■■■■■
7 2371027 2761796 1260 ■■■■■■■■■■■■■■■■■■■■■■
8 2761796 3100446 1043 ■■■■■■■■■■■■■■■■■■
9 3100446 3509108 1103 ■■■■■■■■■■■■■■■■■■■
10 3509108 3808976 1034 ■■■■■■■■■■■■■■■■■■

Checking the Most Common Values

Consider a table tab and a column col, we get the most common values with their frequencies using:

SELECT  most_common_vals, most_common_freqs FROM pg_stats
WHERE tablename='tab' AND attname='col';

We can display the estimated rows and the actual with their normalized difference using bars (one square each 5%) as follow:

WITH n_rows AS (
SELECT count(*) AS n_rows FROM tab),

T AS 
(SELECT n_rows, 
        unnest(most_common_vals::text::int[]) AS mcv, 
        unnest(most_common_freqs::text::float[]) AS mcf
FROM pg_stats, n_rows
WHERE tablename='tab' AND attname='col'),

mcv_check AS (
SELECT mcv, (mcf * n_rows)::int AS estimated_n_rows, count(*) AS n_rows
FROM T, tab AS tab WHERE tab.col = mcv 
GROUP BY mcv, mcf, n_rows)

SELECT mcv, estimated_n_rows, n_rows,
repeat('■',
((abs(n_rows - estimated_n_rows)::float / n_rows::float)* 20)::int) as deviation
FROM mcv_check
ORDER BY mcv;
mcv estimatednrows nrows deviation
95759 10008 9018 ■■■
142005 6255 6119
438087 12197 11849
799888 32087 31076
827510 18014 17767
1519231 8882 8746
2826276 17451 16572
3240017 3002 3076
3549370 3002 3571 ■■■■
3646595 7631 6922 ■■
3656433 3127 2150 ■■■■■■■■■
3693069 2877 2482 ■■■■
4525288 8444 6986 ■■■■■
4540224 3628 4128 ■■■
4868255 10758 9889 ■■
5430681 4003 4855 ■■■■■
5566825 5317 4729 ■■■
6285203 7568 8474 ■■■