
任何一个关系型数据库关于模糊匹配(like)的优化都是一件痛苦的事,相对而言,诸如like 'abc%'之类的前缀匹配还好一点,可以通过创建索引来优化,但对于like '%c%'之类以通配符开头的匹配,普通索引就真的没有办法了。

这里介绍一种postgresql关于like '%c%'的优化方法,是基于全文检索的特性来实现的。

测试数据准备(环境centos6.5 + postgresql 9.6.1)。

postgres=# create table ts(id int,name text);
postgres=# \d ts
Table "public.ts"
Column | Type  | Modifiers
id   | integer |
name  | text  |
postgres=# insert into ts select n,n||'_pjy' from generate_series(1,2000) n;
INSERT 0 2000
postgres=# insert into ts select n,n||'_mdh' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_lmm' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_syf' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_wbd' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_hhh' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_sjw' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_jjs' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_ymd' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_biu' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# insert into ts select n,n||'_dfl' from generate_series(1,2000000) n;
INSERT 0 2000000
postgres=# select count(*) from ts;
  count
----------
 20002000
(1 row)


postgres=# explain analyze select * from ts where name like '%pjy%';
                        QUERY PLAN
 Seq Scan on ts (cost=0.00..358144.05 rows=2000 width=15) (actual time=0.006..1877.087 rows=2000 loops=1)
  Filter: (name ~~ '%pjy%'::text)
  Rows Removed by Filter: 20000000
 Planning time: 0.031 ms
 Execution time: 1877.178 ms
(5 rows)


postgres=# create index idx_name on ts using gin (to_tsvector('english',name));
postgres=# vacuum analyze ts;
postgres=# \d ts
   Table "public.ts"
 Column | Type  | Modifiers
 id   | integer |
 name  | text  |
  "idx_name" gin (to_tsvector('english'::regconfig, name))
postgres=# explain analyze select * from ts where to_tsvector('english',name) @@ to_tsquery('pjy');
                           QUERY PLAN
 Bitmap Heap Scan on ts (cost=39.75..8187.70 rows=2000 width=15) (actual time=0.016..0.016 rows=0 loops=1)
  Recheck Cond: (to_tsvector('english'::regconfig, name) @@ to_tsquery('pjy'::text))
  -> Bitmap Index Scan on idx_name (cost=0.00..39.25 rows=2000 width=0) (actual time=0.016..0.016 rows=0 loops=1)
     Index Cond: (to_tsvector('english'::regconfig, name) @@ to_tsquery('pjy'::text))
 Planning time: 0.094 ms
 Execution time: 0.036 ms
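(6 rows)

注意:上面全文检索的执行计划中实际返回行数为 rows=0,而前面的 like '%pjy%' 返回了 2000 行,两者结果并不一致,对比耗时之前应先核对两条查询的结果是否等价。全文检索是按分词后的词位(lexeme)匹配,并不是任意子串匹配,因此不能完全等价地替代 like '%...%';如果需要真正的子串匹配,可以考虑使用 pg_trgm 扩展配合 gin/gist 索引。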










执行计划对比(gbase8s vs postgresql):



SET EXPLAIN FILE TO '/home/gbasedbt/sqexplain.out' ;
select skip 0 first 15 * from ( select * from T_SZGL_JDRY order by T_SZGL_JDRY.updatetime desc ) Estimated Cost: 3207 Estimated # of Rows Returned: 6172 ​ 1) gbasedbt.t_szgl_jdry: INDEX PATH  (1) Index Name: gbasedbt.i_t_szgl_jdry_updatetime    Index Keys: updatetime (Reverse) (Serial, fragments: ALL) QUERY: (OPTIMIZATION TIMESTAMP: 12-21-2017 03:20:43) ------ select skip 0 first 15 * from ( select * from T_SZGL_JDRY order by T_SZGL_JDRY.updatetime desc ) Estimated Cost: 232 Estimated # of Rows Returned: 6172 1) (Temp Table For Collection Subquery): SEQUENTIAL SCAN Query statistics: ----------------- The final cost of the plan is reduced because of the FIRST n specification in the query. ​ Table map : ---------------------------- Internal name   Table name ---------------------------- t1        t_szgl_jdry t2        (Temp Table For Collection Subquery) type   table rows_prod est_rows rows_scan time    est_cost ------------------------------------------------------------------- scan   t1   6173    6172   6173    00:00.05  3207  --查询执行用 222 ms,15行受影响


select skip 0 first 15 * from T_SZGL_JDRY order by T_SZGL_JDRY.updatetime desc ​ Estimated Cost: 7 Estimated # of Rows Returned: 6172 ​ 1) gbasedbt.t_szgl_jdry: INDEX PATH  (1) Index Name: gbasedbt.i_t_szgl_jdry_updatetime    Index Keys: updatetime (Reverse) (Serial, fragments: ALL) Query statistics: ----------------- The final cost of the plan is reduced because of the FIRST n specification in the query. ​ Table map : ---------------------------- Internal name   Table name ---------------------------- t1        t_szgl_jdry ​ type   table rows_prod est_rows rows_scan time    est_cost ------------------------------------------------------------------- scan   t1   15     6172   15     00:00.00  8    ​ QUERY: (OPTIMIZATION TIMESTAMP: 12-21-2017 03:23:25) ------ select 1 from sysusers Estimated Cost: 2 Estimated # of Rows Returned: 1 1) gbasedbt.sysusers: SEQUENTIAL SCAN ... --查询执行用 18 ms,15行受影响

从第一个执行计划中的"1) (Temp Table For Collection Subquery): SEQUENTIAL SCAN"可以看出,是先将子查询的结果全部查询出来,再在这个基础上获取15条记录。



db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj order by d_slrq limit 15 offset 0;
                       QUERY PLAN
-------------------------------------------------------------------------
 Limit (cost=0.44..28.17 rows=15 width=879)
   -> Index Scan using idx_ttjaj_dslrq on t_jcxxzy_tjaj (cost=0.44..32374439.85 rows=17507700 width=879)
(2 rows)

--子查询执行计划-嵌套一层
db_jcxxzypt=# explain
db_jcxxzypt-# select * from (
db_jcxxzypt(# select * from db_jcxx.t_jcxxzy_tjaj order by d_slrq
db_jcxxzypt(# )tab1 limit 15 offset 0;
                       QUERY PLAN
-------------------------------------------------------------------------
 Limit (cost=0.44..28.32 rows=15 width=879)
   -> Index Scan using idx_ttjaj_dslrq on t_jcxxzy_tjaj (cost=0.44..32374439.85 rows=17507700 width=879)
(2 rows)

--子查询执行计划-嵌套两层
db_jcxxzypt=# explain
db_jcxxzypt-# select * from (
db_jcxxzypt(# select * from (
db_jcxxzypt(# select * from db_jcxx.t_jcxxzy_tjaj order by d_slrq
db_jcxxzypt(# )tab1 )tab2 limit 15 offset 0;
                       QUERY PLAN
-------------------------------------------------------------------------
 Limit (cost=0.44..28.32 rows=15 width=879)
   -> Index Scan using idx_ttjaj_dslrq on t_jcxxzy_tjaj (cost=0.44..32374439.85 rows=17507700 width=879)
(2 rows)



子查询可分为三类:一、([not] in/all/any/some),二、([not] exists),三、其他子查询(spj子查询:选择、投影、连接)

子查询可以出现在目标列、from子句、where子句、join/on子句、group by子句、having子句、order by子句等位置。

db_jcxxzypt=# explain select * from t_jcxxzy_tjaj aj ,(select * from t_jcxxzy_ajdsr) dsr where dsr.c_ajbm = '1301020400000120090101';
                  QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop (cost=0.56..1252119.58 rows=17507700 width=1098)
   -> Index Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..8.57 rows=1 width=219)
        Index Cond: (c_ajbm = '1301020400000120090101'::bpchar)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1077034.00 rows=17507700 width=879)
(4 rows)

Time: 1.101 ms





db_jcxxzypt=# explain select * from t_jcxxzy_tjaj aj where aj.c_ajbm in (select dsr.c_ajbm from t_jcxxzy_ajdsr dsr);
转化为: select * from t_jcxxzy_tjaj aj join t_jcxxzy_ajdsr dsr on aj.c_ajbm = dsr.c_ajbm;
                          QUERY PLAN
-------------------------------------------------------------------------
 Hash Semi Join (cost=362618.61..5537768.07 rows=7957409 width=879)
   Hash Cond: (t_jcxxzy_tjaj.c_ajbm = t_jcxxzy_ajdsr.c_ajbm)
   -> Seq Scan on t_jcxxzy_tjaj (cost=0.00..1077034.00 rows=17507700 width=879)
   -> Hash (cost=237458.59..237458.59 rows=6817202 width=23)
        -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)
--in等价于=any
hash semi join表示执行的是两张表的hash半连接, 原始sql中没有(t_jcxxzy_tjaj.c_ajbm = t_jcxxzy_ajdsr.c_ajbm),表明此in子查询被优化,优化后采用hash semi join算法。
(2).相关子查询
--当加入条件where aj.d_slrq='2001-06-14'后不能提升子链接,如果把where aj.d_slrq ='2001-06-14'放到父查询 是支持子链接优化的
db_jcxxzypt=# explain
db_jcxxzypt-# select * from t_jcxxzy_tjaj aj where c_ajbm in (select c_ajbm from t_jcxxzy_ajdsr dsr where aj.d_slrq='2001-06-14') ;
                           QUERY PLAN
-------------------------------------------------------------------------
 Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..2227874766580.75 rows=8753850 width=879)
   Filter: (SubPlan 1)
   SubPlan 1
     -> Result (cost=0.56..237458.59 rows=6817202 width=23)
          One-Time Filter: (aj.d_slrq = '2001-06-14'::date)
          -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr dsr (cost=0.56..237458.59 rows=6817202 width=23)
(6 rows)
(3).
-- not in不能提升子链接
db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm not in (select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                          QUERY PLAN
-------------------------------------------------------------------------
 Seq Scan on t_jcxxzy_tjaj (cost=0.56..2875921362927.06 rows=8753850 width=879)
   Filter: (NOT (SubPlan 1))
   SubPlan 1
     -> Materialize (cost=0.56..311489.60 rows=6817202 width=23)
          -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)
--not in与<>all含义相同




db_jcxxzypt=# explain
db_jcxxzypt-# select * from t_jcxxzy_tjaj aj where exists (select c_ajbm from t_jcxxzy_ajdsr dsr where aj.c_ajbm = dsr.c_ajbm);
                           QUERY PLAN
-------------------------------------------------------------------------
 Hash Semi Join (cost=362618.61..5537768.07 rows=7957409 width=879)
   Hash Cond: (aj.c_ajbm = dsr.c_ajbm)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1077034.00 rows=17507700 width=879)
   -> Hash (cost=237458.59..237458.59 rows=6817202 width=23)
        -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr dsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)
-- 当加入where aj.c_xzdm = '150622'条件在子链接时,仍然支持上拉
db_jcxxzypt=# explain
db_jcxxzypt-# select * from t_jcxxzy_tjaj aj where exists (select c_ajbm from t_jcxxzy_ajdsr dsr where aj.c_xzdm = '150622');
                         QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop Semi Join (cost=0.56..1361779.20 rows=5436 width=879)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1120803.25 rows=5436 width=879)
        Filter: ((c_xzdm)::text = '150622'::text)
   -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr dsr (cost=0.56..237458.59 rows=6817202 width=0)
(4 rows)
--exists子链接
db_jcxxzypt=# explain
db_jcxxzypt-# select * from t_jcxxzy_tjaj aj where exists (select c_ajbm from t_jcxxzy_ajdsr dsr where dsr.c_ajbm='1101120300000120030101')
db_jcxxzypt-# ;
                       QUERY PLAN
-------------------------------------------------------------------------
 Result (cost=4.58..1077038.57 rows=17507700 width=879)
   One-Time Filter: $0
   InitPlan 1 (returns $0)
     -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr dsr (cost=0.56..4.58 rows=1 width=0)
          Index Cond: (c_ajbm = '1101120300000120030101'::bpchar)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1077034.00 rows=17507700 width=879)
(6 rows)


--not exists子链接
db_jcxxzypt=# explain
db_jcxxzypt-# select * from t_jcxxzy_tjaj aj where not exists (select c_ajbm from t_jcxxzy_ajdsr dsr);
                  QUERY PLAN
-------------------------------------------------------------------------
 Result (cost=0.04..1077034.04 rows=17507700 width=879)
   One-Time Filter: (NOT $0)
   InitPlan 1 (returns $0)
     -> Seq Scan on t_jcxxzy_ajdsr dsr (cost=0.00..281210.02 rows=6817202 width=0)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1077034.00 rows=17507700 width=879)
(5 rows)

从执行计划上看,not exists子查询并没有被消除,子查询只是执行了一次,将结果作为aj表的参数。



db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm >all(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                          QUERY PLAN
-------------------------------------------------------------------------
 Seq Scan on t_jcxxzy_tjaj (cost=0.56..2875921362927.06 rows=8753850 width=879)
   Filter: (SubPlan 1)
   SubPlan 1
     -> Materialize (cost=0.56..311489.60 rows=6817202 width=23)
          -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)

db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm =all(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                          QUERY PLAN
-------------------------------------------------------------------------
 Seq Scan on t_jcxxzy_tjaj (cost=0.56..2875921362927.06 rows=8753850 width=879)
   Filter: (SubPlan 1)
   SubPlan 1
     -> Materialize (cost=0.56..311489.60 rows=6817202 width=23)
          -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)

db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm <all(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                          QUERY PLAN
-------------------------------------------------------------------------
 Seq Scan on t_jcxxzy_tjaj (cost=0.56..2875921362927.06 rows=8753850 width=879)
   Filter: (SubPlan 1)
   SubPlan 1
     -> Materialize (cost=0.56..311489.60 rows=6817202 width=23)
          -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)




db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm >some(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                        QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop Semi Join (cost=0.56..11316607.35 rows=5835900 width=879)
   -> Seq Scan on t_jcxxzy_tjaj (cost=0.00..1077034.00 rows=17507700 width=879)
   -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..64266.97 rows=2272401 width=23)
        Index Cond: (c_ajbm < t_jcxxzy_tjaj.c_ajbm)
(4 rows)

db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm =some(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                          QUERY PLAN
-------------------------------------------------------------------------
 Hash Semi Join (cost=362618.61..5537768.07 rows=7957409 width=879)
   Hash Cond: (t_jcxxzy_tjaj.c_ajbm = t_jcxxzy_ajdsr.c_ajbm)
   -> Seq Scan on t_jcxxzy_tjaj (cost=0.00..1077034.00 rows=17507700 width=879)
   -> Hash (cost=237458.59..237458.59 rows=6817202 width=23)
        -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..237458.59 rows=6817202 width=23)
(5 rows)

db_jcxxzypt=# explain select * from db_jcxx.t_jcxxzy_tjaj where c_ajbm <some(select c_ajbm from db_jcxx.t_jcxxzy_ajdsr);
                        QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop Semi Join (cost=0.56..11316607.35 rows=5835900 width=879)
   -> Seq Scan on t_jcxxzy_tjaj (cost=0.00..1077034.00 rows=17507700 width=879)
   -> Index Only Scan using idx_tajdsr_cajbm on t_jcxxzy_ajdsr (cost=0.56..64266.97 rows=2272401 width=23)
        Index Cond: (c_ajbm > t_jcxxzy_tjaj.c_ajbm)
(4 rows)
--some的执行计划中未出现子查询(SubPlan),dsr表都被上拉到父查询中,与aj表进行嵌套半连接和hash半连接



子查询上拉不支持以下格式的子查询:带有with子句(cte)、集合操作、聚集函数(aggregates、group by、distinct)、having、limit/offset等子句。例如下面带limit的子查询就没有被上拉:

db_jcxxzypt=# explain select * from t_jcxxzy_tjaj aj ,(select * from t_jcxxzy_ajdsr limit 10) dsr where dsr.c_ajbm = '1301020400000120090101';
                    QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop (cost=0.00..1252111.54 rows=17507700 width=1098)
   -> Subquery Scan on dsr (cost=0.00..0.54 rows=1 width=219)
        Filter: (dsr.c_ajbm = '1301020400000120090101'::bpchar)
        -> Limit (cost=0.00..0.41 rows=10 width=219)
             -> Seq Scan on t_jcxxzy_ajdsr (cost=0.00..281210.02 rows=6817202 width=219)
   -> Seq Scan on t_jcxxzy_tjaj aj (cost=0.00..1077034.00 rows=17507700 width=879)
(6 rows)

Time: 0.958 ms




join或子查询的优化,属于优化器优化JOIN的范畴。 ​



#from_collapse_limit = 8


#join_collapse_limit = 8        # 1 disables collapsing of explicit
                    # JOIN clauses

当使用显式的JOIN时(除了full join),例如a join b join c join d,优化器可以重排JOIN的顺序,以产生更多的PLAN,从中选择更优的执行计划。如果join_collapse_limit=1,则不重排,使用SQL写法提供的顺序。

如果用户要固化JOIN顺序,请使用显式的JOIN,同时将join_collapse_limit设置为1。如果用户不打算提升子查询,同样的,将from_collapse_limit设置为1即可。


子查询中没有group by子句,也没有聚集函数,则可使用下面的等价转换

val>all(select...) to val>max(select...)
val<all(select...) to val<min(select...)
val>any(select...) to val>min(select...)
val<any(select...) to val<max(select...)
val>=all(select...) to val>=max(select...)
val<=all(select...) to val<=min(select...)
val>=any(select...) to val>=min(select...)
val<=any(select...) to val<=max(select...)




select * from t_jcxxzy_tjaj aj where c_ajbm in (select c_ajbm from t_jcxxzy_ajdsr dsr where dsr.c_ajbm = aj.c_ajbm)/* 子查询语句中存在父查询的列 */


select * from t_jcxxzy_tjaj aj where c_ajbm in (select c_ajbm from t_jcxxzy_ajdsr dsr where dsr.c_xzdm = '150622')/* 子查询语句中不存在父查询的属性 */







6.not exists虽然没有被上拉,但是被优化为只执行一次,相对于not in稍好






