# 使用multicolumns statistics帮助Optimizer计算出更准确的cardinality

optimizer对于cardinality值的估算是否准确关系到能否生成最优的执行计划，而cardinality值估算的准确性又取决于SQL中各个对象的统计信息是否完整、是否能真实反映出对象的数据分布情况。因此使用何种方法收集统计信息是很有讲究的：对于数据倾斜度较大的表开启histogram，在此基础上如果有多个列存在相关性，那么multicolumns statistics又是一个更好的选择，下面用实验来证明multicolumns statistics的独到之处

###创建测试用表
drop table cgtest1;
create table cgtest1 (c1 number,c2 varchar2(2),c3 varchar2(20)) tablespace ts_info_dat_01;

declare
begin
for i in 1..5000 loop
insert into cgtest1 values(1,'AA',dbms_random.string('l',20));
insert into cgtest1 values(2,'BB',dbms_random.string('l',20));
insert into cgtest1 values(3,'CC',dbms_random.string('l',20));
insert into cgtest1 values(4,'DD',dbms_random.string('l',20));
end loop;
commit;
end;
/

insert into cgtest1 values(11,'A','AAAAAAA');
insert into cgtest1 values(22,'B','BBBBBBB');
insert into cgtest1 values(33,'C','CCCCCCC');
insert into cgtest1 values(44,'D','DDDDDDD');
commit;

SQL> select count(1) from cgtest1;

COUNT(1)
----------
20004

select c1,c2,count(1) from cgtest1 group by c1,c2;
C1 C2   COUNT(1)
---------- -- ----------
1 AA       5000
2 BB       5000
3 CC       5000
4 DD       5000
11 A           1
22 B           1
33 C           1
44 D           1

###收集cgtest1表的统计信息(但不收集histogram信息)
---收集前确认默认的estimate_percent为auto_sample_size
SQL> SELECT dbms_stats.get_prefs('estimate_percent',NULL,NULL) from dual;

DBMS_STATS.GET_PREFS('ESTIMATE_PERCENT',NULL,NULL)
------------------------------------------------------------------------------------------------------------------------------------------------------
DBMS_STATS.AUTO_SAMPLE_SIZE

exec dbms_stats.gather_table_stats(ownname=>'ad',tabname=>'cgtest1',method_opt=>'FOR ALL COLUMNS SIZE 1');

set linesize 150
SQL> select owner,table_name,NUM_DISTINCT,sample_size,column_name,histogram from dba_tab_col_statistics where owner='AD' and table_name='CGTEST1';

OWNER                          TABLE_NAME                     NUM_DISTINCT SAMPLE_SIZE COLUMN_NAME                    HISTOGRAM
------------------------------ ------------------------------ ------------ ----------- ------------------------------ ---------------
AD                             CGTEST1                                   8       20004 C1                             NONE
AD                             CGTEST1                                   8       20004 C2                             NONE
AD                             CGTEST1                               19938       20004 C3                             NONE

---c1=1 and c2='AA'实际返回值5000与optimizer估算值313还是有不少差距
SQL> select count(*) from cgtest1 where c1=1 and c2='AA';

COUNT(*)
----------
5000

explain plan for select count(*) from cgtest1 where c1=1 and c2='AA';

set linesize 150
SQL> select * from table(dbms_xplan.display());

PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------------------------------------------------------
Plan hash value: 4200988577

------------------------------------------------------------------------------
| Id  | Operation          | Name    | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |         |     1 |     6 |    21   (0)| 00:00:01 |
|   1 |  SORT AGGREGATE    |         |     1 |     6 |            |          |
|*  2 |   TABLE ACCESS FULL| CGTEST1 |   313 |  1878 |    21   (0)| 00:00:01 |
------------------------------------------------------------------------------

---c1=11 and c2='A'实际返回值1与optimizer估算值313还是有不少差距
SQL> select count(*) from cgtest1 where c1=11 and c2='A';

COUNT(*)
----------
1

explain plan for select count(*) from cgtest1 where c1=11 and c2='A';
SQL> select * from table(dbms_xplan.display());

PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------------------------------------------------------
Plan hash value: 4200988577

------------------------------------------------------------------------------
| Id  | Operation          | Name    | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |         |     1 |     6 |    21   (0)| 00:00:01 |
|   1 |  SORT AGGREGATE    |         |     1 |     6 |            |          |
|*  2 |   TABLE ACCESS FULL| CGTEST1 |   313 |  1878 |    21   (0)| 00:00:01 |
------------------------------------------------------------------------------

###收集c1、c2列的直方图后重新执行上面两个查询
exec dbms_stats.gather_table_stats(ownname=>'ad',tabname=>'cgtest1',method_opt=>'FOR ALL COLUMNS SIZE 1 FOR COLUMNS c1 size skewonly,c2 size skewonly');

set linesize 170
SQL> select owner,table_name,NUM_DISTINCT,density,num_buckets,sample_size,column_name,histogram from dba_tab_col_statistics where owner='AD' and table_name='CGTEST1';

OWNER                          TABLE_NAME                     NUM_DISTINCT    DENSITY NUM_BUCKETS SAMPLE_SIZE COLUMN_NAME                    HISTOGRAM
------------------------------ ------------------------------ ------------ ---------- ----------- ----------- ------------------------------ ---------------
AD                             CGTEST1                                   8 .000024995           8       20004 C1                             FREQUENCY
AD                             CGTEST1                                   8 .000024995           8       20004 C2                             FREQUENCY
AD                             CGTEST1                               19938 .000050155           1       20004 C3                             NONE

SQL> col column_name format a30
SQL> col endpoint_actual_value format a50
SQL> set linesize 170
SQL> set pagesize 100
select owner,table_name,column_name,endpoint_number,endpoint_value from dba_tab_histograms where table_name='CGTEST1';
OWNER                          TABLE_NAME                     COLUMN_NAME                    ENDPOINT_NUMBER ENDPOINT_VALUE
------------------------------ ------------------------------ ------------------------------ --------------- --------------

---c1=1 and c2='AA'作为predicate执行查询，看下这次是否cardinality值会更加接近真实返回值
select count(*) from cgtest1 where c1=1 and c2='AA';
COUNT(*)
----------
5000

set linesize 150
SQL> explain plan for select count(*) from cgtest1 where c1=1 and c2='AA';

Explained.

SQL> select * from table(dbms_xplan.display());
PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------------------------------------------------------
Plan hash value: 4200988577

------------------------------------------------------------------------------
| Id  | Operation          | Name    | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |         |     1 |     6 |    21   (0)| 00:00:01 |
|   1 |  SORT AGGREGATE    |         |     1 |     6 |            |          |
|*  2 |   TABLE ACCESS FULL| CGTEST1 |  1250 |  7500 |    21   (0)| 00:00:01 |
------------------------------------------------------------------------------

optimizer里的rows是这样预估出来的:num_rows*(5000/20004)*(5000/20004)=20004*0.0624=1248.2496，相比313更接近于真实值5000，可见有了histogram之后的估算更加准确了

---c1=11 and c2='A'作为predicate执行查询，看下这次是否cardinality值会更加接近真实返回值
SQL> select count(*) from cgtest1 where c1=11 and c2='A';

COUNT(*)
----------
1

explain plan for select count(*) from cgtest1 where c1=11 and c2='A';
SQL> select * from table(dbms_xplan.display());
PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------------------------------------------------------
Plan hash value: 4200988577

------------------------------------------------------------------------------
| Id  | Operation          | Name    | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |         |     1 |     6 |    21   (0)| 00:00:01 |
|   1 |  SORT AGGREGATE    |         |     1 |     6 |            |          |
|*  2 |   TABLE ACCESS FULL| CGTEST1 |     1 |     6 |    21   (0)| 00:00:01 |
------------------------------------------------------------------------------

optimizer里的rows是这样预估出来的:num_rows*(1/20004)*(1/20004)=0.00005，近似取值为1

1、采纳系统检测工作负载后给出的建议值后收集统计，如果DBA对表里数据构成情况及表中哪些列具有相关性事先不知道的情况下可以采用这种方法，oracle会根据当前的负载给出哪些表里的哪几个列之间存在相关性的建议，DBA如果采纳这个建议就可以在这几个列上创建出column group
2、手动创建column group后再收集统计信息，对表中具有相关性的列心知肚明，就可以使用手动创建的方法

###方法1：采纳系统检测工作负载后给出的建议值来生成column group

---针对select count(*) from cgtest1 where c1=1 and c2='AA'让oracle生成创建column group的建议
exec dbms_stats.seed_col_usage(NULL,NULL,TIME_limit=>100);

explain plan for select count(*) from cgtest1 where c1=1 and c2='AA';

set long 20000
set pagesize 100

--------------------------------------------------------------------------------
LEGEND:
.......

EQ         : Used in single table EQuality predicate
RANGE      : Used in single table RANGE predicate
LIKE       : Used in single table LIKE predicate
NULL       : Used in single table is (not) NULL predicate
EQ_JOIN    : Used in EQuality JOIN predicate
NONEQ_JOIN : Used in NON EQuality JOIN predicate
FILTER     : Used in single table FILTER predicate
JOIN       : Used in JOIN predicate
GROUP_BY   : Used in GROUP BY expression
...............................................................................

###############################################################################

..................................

1. C1                                  : EQ
2. C2                                  : EQ
3. (C1, C2)                            : FILTER
###############################################################################

***根据上面(C1, C2):filter的建议，生成column group: SYS_STUF3GLKIOP5F4B0BTTCFTMX0W
--------------------------------------------------------------------------------
###############################################################################

.........................

1. (C1, C2)                            : SYS_STUF3GLKIOP5F4B0BTTCFTMX0W created
###############################################################################

***dba_stat_extensions查询column group信息
COL EXtension format a50
set linesize 170
SQL> select * from dba_stat_extensions where table_name='CGTEST1';

OWNER                          TABLE_NAME                     EXTENSION_NAME                 EXTENSION                                          CREATO DRO
------------------------------ ------------------------------ ------------------------------ -------------------------------------------------- ------ ---
AD                             CGTEST1                        SYS_STUF3GLKIOP5F4B0BTTCFTMX0W ("C1","C2")                                        USER   YES

***SYS_STUF3GLKIOP5F4B0BTTCFTMX0W是系统为column group自动生成的名称，可以把它看作表中的一个列，针对SYS_STUF3GLKIOP5F4B0BTTCFTMX0W列生成统计信息
set linesize 170
col extension format a15
SQL> select t1.owner,t1.table_name,t1.column_name,t2.extension,NUM_DISTINCT,sample_size,histogram from dba_tab_col_statistics t1,dba_stat_extensions t2 where t1.owner='AD' and t1.table_name='CGTEST1' and t1.owner=t2.owner and t1.table_name=t2.table_name and t1.column_name=t2.EXTENSION_NAME;

no rows selected

exec dbms_stats.gather_table_stats(ownname=>'ad',tabname=>'cgtest1',method_opt=>'FOR COLUMNS SYS_STUF3GLKIOP5F4B0BTTCFTMX0W SIZE skewonly');

SQL> select t1.owner,t1.table_name,t1.column_name,t2.extension,NUM_DISTINCT,sample_size,histogram from dba_tab_col_statistics t1,dba_stat_extensions t2 where t1.owner='AD' and t1.table_name='CGTEST1' and t1.owner=t2.owner and t1.table_name=t2.table_name and t1.column_name=t2.EXTENSION_NAME;

OWNER                          TABLE_NAME                     COLUMN_NAME                    EXTENSION       NUM_DISTINCT SAMPLE_SIZE HISTOGRAM
------------------------------ ------------------------------ ------------------------------ --------------- ------------ ----------- ---------------
AD                             CGTEST1                        SYS_STUF3GLKIOP5F4B0BTTCFTMX0W ("C1","C2")                8       20004 FREQUENCY

---从sql cursor cache里兜取出语句部分语句让oracle来评估(需要先建立sql tuning set)
EXEC DBMS_SQLTUNE.CREATE_SQLSET('cgsts1');

DECLARE
cur DBMS_SQLTUNE.SQLSET_CURSOR;
BEGIN
OPEN cur FOR SELECT VALUE(P) FROM table(DBMS_SQLTUNE.SELECT_CURSOR_CACHE(basic_filter=>'parsing_schema_name <> ''SYS'' AND sql_text like ''select%and%'' AND first_load_time > ''2015-01-01/01:36:34'' and first_load_time < ''2015-01-22/01:36:34''',ranking_measure1=>'first_load_time',result_limit=>20)) P;
END;
/

SQL> select count(*) from dba_sqlset_statements where sqlset_name='cgsts1';

COUNT(*)
----------
20

***使用dbms_stats.seed_col_usage对cgsts1里的20条sql给出是否创建column group的建议

grant analyze any,analyze any dictionary to ad;

***针对sql tuning set中的某个表生成建议报告，前提是这个表必须要有统计信息
set long 2000000
set pagesize 500

***下面是报告详细内容，最后一行用(ACCT_ID, BILL_MONTH, STS)，表明这三个字段是一起进行查询的，可以建立一个基于此三个字段的column group
--------------------------------------------------------------------------------
LEGEND:
.......

EQ         : Used in single table EQuality predicate
RANGE      : Used in single table RANGE predicate
LIKE       : Used in single table LIKE predicate
NULL       : Used in single table is (not) NULL predicate
EQ_JOIN    : Used in EQuality JOIN predicate
NONEQ_JOIN : Used in NON EQuality JOIN predicate
FILTER     : Used in single table FILTER predicate
JOIN       : Used in JOIN predicate
GROUP_BY   : Used in GROUP BY expression
...............................................................................

###############################################################################

..................................................

1. R_ID                             : EQ
2. BEGIN_DATE                          : EQ RANGE
3. BILL_MONTH                          : EQ
4. END_DATE                            : EQ
5. RATE_ID                             : EQ_JOIN
6. STS                                 : EQ
7. (ACCT_ID, BILL_MONTH, STS)          : FILTER
###############################################################################

***创建column group

###方法2：手动创建column group
---手动创建column group后再通过dbms_stats.gather_table_stats收集统计
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
SYS_STU3RTXGYOX7NS\$MIUDXQDMQ0C

exec dbms_stats.gather_table_stats(ownname=>'AD',tabname=>'cgtest1',method_opt=>'FOR COLUMNS SYS_STU3RTXGYOX7NS\$MIUDXQDMQ0C SIZE skewonly');)

---或者一步到位：直接对c1、c2列执行统计信息收集，同时也会生成column group
EXEC DBMS_STATS.gather_table_stats('ad','cgtest2',method_opt=>'for columns (c1,c2) size skewonly');

###生成了column group statistics之后我们再次执行一开始的那句sql：select count(*) from cgtest1 where c1=1 and c2='AA'，看看是否能帮助optimizer算出更精确的cardinality
---先来看看对于代表(c1,c2)的SYS_STUF3GLKIOP5F4B0BTTCFTMX0W列在dba_tab_histogram里的数据分布情况
SQL> col column_name format a30
SQL> col endpoint_actual_value format a50
SQL> set linesize 170
SQL> set pagesize 100
SQL> select owner,table_name,column_name,endpoint_number,endpoint_value from dba_tab_histograms where table_name='CGTEST1' and column_name='SYS_STUF3GLKIOP5F4B0BTTCFTMX0W';
OWNER                          TABLE_NAME                     COLUMN_NAME                    ENDPOINT_NUMBER ENDPOINT_VALUE
------------------------------ ------------------------------ ------------------------------ --------------- --------------

---预测一下有了基于(c1、c2)的column groups后，select count(*) from cgtest1 where c1=1 and c2='AA'的cardinality返回值会变成多少
cardinality=num_rows*5000/20004=20004*5000/20004=5000

---实际执行结果与我们的计算结果一致
SQL> select count(*) from cgtest1 where c1=1 and c2='AA';

COUNT(*)
----------
5000

explain plan for select count(*) from cgtest1 where c1=1 and c2='AA';

set linesize 150
SQL> select * from table(dbms_xplan.display());

PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------------------------------------------------------
Plan hash value: 4200988577

------------------------------------------------------------------------------
| Id  | Operation          | Name    | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |         |     1 |     6 |    21   (0)| 00:00:01 |
|   1 |  SORT AGGREGATE    |         |     1 |     6 |            |          |
|*  2 |   TABLE ACCESS FULL| CGTEST1 |  5000 | 30000 |    21   (0)| 00:00:01 |
------------------------------------------------------------------------------

• 博文量
128
• 访问量
1616859