Skip to content
This repository was archived by the owner on Jan 28, 2021. It is now read-only.

Commit 7beeb8d

Browse files
authored
Merge pull request #572 from erizocosmico/feature/prune-columns
analyzer: add rule to prune unnecessary columns
2 parents b8d84bf + 14c6aeb commit 7beeb8d

File tree

5 files changed

+545
-2
lines changed

5 files changed

+545
-2
lines changed

engine_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -981,14 +981,14 @@ func TestDescribe(t *testing.T) {
981981

982982
query := `DESCRIBE FORMAT=TREE SELECT * FROM mytable`
983983
expectedSeq := []sql.Row{
984-
sql.NewRow("Table(mytable): Projected "),
984+
sql.NewRow("Table(mytable)"),
985985
sql.NewRow(" ├─ Column(i, INT64, nullable=false)"),
986986
sql.NewRow(" └─ Column(s, TEXT, nullable=false)"),
987987
}
988988

989989
expectedParallel := []sql.Row{
990990
{"Exchange(parallelism=2)"},
991-
{" └─ Table(mytable): Projected "},
991+
{" └─ Table(mytable)"},
992992
{" ├─ Column(i, INT64, nullable=false)"},
993993
{" └─ Column(s, TEXT, nullable=false)"},
994994
}

sql/analyzer/optimization_rules.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ func reorderProjection(ctx *sql.Context, a *Analyzer, n sql.Node) (sql.Node, err
116116
return node, nil
117117
}
118118

119+
if len(requiredColumns) == 0 {
120+
return node, nil
121+
}
122+
119123
didNeedReorder = true
120124

121125
// Only add the required columns for that node in the projection.

sql/analyzer/prune_columns.go

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
package analyzer
2+
3+
import (
4+
"fmt"
5+
6+
"gopkg.in/src-d/go-mysql-server.v0/sql"
7+
"gopkg.in/src-d/go-mysql-server.v0/sql/expression"
8+
"gopkg.in/src-d/go-mysql-server.v0/sql/plan"
9+
)
10+
11+
type usedColumns map[string]map[string]struct{}
12+
13+
func pruneColumns(ctx *sql.Context, a *Analyzer, n sql.Node) (sql.Node, error) {
14+
a.Log("pruning columns, node of type %T", n)
15+
if !n.Resolved() {
16+
return n, nil
17+
}
18+
19+
columns := make(usedColumns)
20+
21+
// All the columns required for the output of the query must be mark as
22+
// used, otherwise the schema would change.
23+
for _, col := range n.Schema() {
24+
if _, ok := columns[col.Source]; !ok {
25+
columns[col.Source] = make(map[string]struct{})
26+
}
27+
columns[col.Source][col.Name] = struct{}{}
28+
}
29+
30+
findUsedColumns(columns, n)
31+
32+
n, err := addSubqueryBarriers(n)
33+
if err != nil {
34+
return nil, err
35+
}
36+
37+
n, err = pruneUnusedColumns(n, columns)
38+
if err != nil {
39+
return nil, err
40+
}
41+
42+
n, err = pruneSubqueries(ctx, a, n, columns)
43+
if err != nil {
44+
return nil, err
45+
}
46+
47+
return fixRemainingFieldsIndexes(n)
48+
}
49+
50+
func pruneSubqueryColumns(
51+
ctx *sql.Context,
52+
a *Analyzer,
53+
n *plan.SubqueryAlias,
54+
parentColumns usedColumns,
55+
) (sql.Node, error) {
56+
a.Log("pruning columns of subquery with alias %q", n.Name())
57+
58+
columns := make(usedColumns)
59+
60+
// The columns coming from the parent have the subquery alias name as the
61+
// source. We need to find the real table in order to prune the subquery
62+
// correctly.
63+
tableByCol := make(map[string]string)
64+
for _, col := range n.Child.Schema() {
65+
tableByCol[col.Name] = col.Source
66+
}
67+
68+
for col := range parentColumns[n.Name()] {
69+
table, ok := tableByCol[col]
70+
if !ok {
71+
// This should never happen, but better be safe than sorry.
72+
return nil, fmt.Errorf("this is likely a bug: missing projected column %q on subquery %q", col, n.Name())
73+
}
74+
75+
if _, ok := columns[table]; !ok {
76+
columns[table] = make(map[string]struct{})
77+
}
78+
79+
columns[table][col] = struct{}{}
80+
}
81+
82+
findUsedColumns(columns, n)
83+
84+
node, err := addSubqueryBarriers(n.Child)
85+
if err != nil {
86+
return nil, err
87+
}
88+
89+
node, err = pruneUnusedColumns(node, columns)
90+
if err != nil {
91+
return nil, err
92+
}
93+
94+
node, err = pruneSubqueries(ctx, a, node, columns)
95+
if err != nil {
96+
return nil, err
97+
}
98+
99+
// There is no need to fix the field indexes after pruning here
100+
// because the main query will take care of fixing the indexes of all the
101+
// nodes in the tree.
102+
103+
return plan.NewSubqueryAlias(n.Name(), node), nil
104+
}
105+
106+
func findUsedColumns(columns usedColumns, n sql.Node) {
107+
plan.Inspect(n, func(n sql.Node) bool {
108+
switch n := n.(type) {
109+
case *plan.Project:
110+
addUsedProjectColumns(columns, n.Projections)
111+
return true
112+
case *plan.GroupBy:
113+
addUsedProjectColumns(columns, n.Aggregate)
114+
addUsedColumns(columns, n.Grouping)
115+
return true
116+
case *plan.SubqueryAlias:
117+
return false
118+
}
119+
120+
exp, ok := n.(sql.Expressioner)
121+
if ok {
122+
addUsedColumns(columns, exp.Expressions())
123+
}
124+
125+
return true
126+
})
127+
}
128+
129+
func addSubqueryBarriers(n sql.Node) (sql.Node, error) {
130+
return n.TransformUp(func(n sql.Node) (sql.Node, error) {
131+
sq, ok := n.(*plan.SubqueryAlias)
132+
if !ok {
133+
return n, nil
134+
}
135+
136+
return &subqueryBarrier{sq}, nil
137+
})
138+
}
139+
140+
func pruneSubqueries(
141+
ctx *sql.Context,
142+
a *Analyzer,
143+
n sql.Node,
144+
parentColumns usedColumns,
145+
) (sql.Node, error) {
146+
return n.TransformUp(func(n sql.Node) (sql.Node, error) {
147+
barrier, ok := n.(*subqueryBarrier)
148+
if !ok {
149+
return n, nil
150+
}
151+
152+
return pruneSubqueryColumns(ctx, a, barrier.SubqueryAlias, parentColumns)
153+
})
154+
}
155+
156+
func pruneUnusedColumns(n sql.Node, columns usedColumns) (sql.Node, error) {
157+
return n.TransformUp(func(n sql.Node) (sql.Node, error) {
158+
switch n := n.(type) {
159+
case *plan.Project:
160+
return pruneProject(n, columns), nil
161+
case *plan.GroupBy:
162+
return pruneGroupBy(n, columns), nil
163+
default:
164+
return n, nil
165+
}
166+
})
167+
}
168+
169+
type tableColumnPair struct {
170+
table string
171+
column string
172+
}
173+
174+
func fixRemainingFieldsIndexes(n sql.Node) (sql.Node, error) {
175+
return n.TransformUp(func(n sql.Node) (sql.Node, error) {
176+
exp, ok := n.(sql.Expressioner)
177+
if !ok {
178+
return n, nil
179+
}
180+
181+
var schema sql.Schema
182+
for _, c := range n.Children() {
183+
schema = append(schema, c.Schema()...)
184+
}
185+
186+
if len(schema) == 0 {
187+
return n, nil
188+
}
189+
190+
indexes := make(map[tableColumnPair]int)
191+
for i, col := range schema {
192+
indexes[tableColumnPair{col.Source, col.Name}] = i
193+
}
194+
195+
return exp.TransformExpressions(func(e sql.Expression) (sql.Expression, error) {
196+
gf, ok := e.(*expression.GetField)
197+
if !ok {
198+
return e, nil
199+
}
200+
201+
idx, ok := indexes[tableColumnPair{gf.Table(), gf.Name()}]
202+
if !ok {
203+
return nil, fmt.Errorf("unable to find column %q of table %q", gf.Name(), gf.Table())
204+
}
205+
206+
ngf := *gf
207+
return ngf.WithIndex(idx), nil
208+
})
209+
})
210+
}
211+
212+
func addUsedProjectColumns(
213+
columns usedColumns,
214+
projection []sql.Expression,
215+
) {
216+
var candidates []sql.Expression
217+
for _, e := range projection {
218+
// Only check for expressions that are not directly a GetField. This
219+
// is because in a projection we only care about those that were used
220+
// to compute new columns, such as aliases and so on. The fields that
221+
// are just passed up in the tree will already be in some other part
222+
// if they are really used.
223+
if _, ok := e.(*expression.GetField); !ok {
224+
candidates = append(candidates, e)
225+
}
226+
}
227+
228+
addUsedColumns(columns, candidates)
229+
}
230+
231+
func addUsedColumns(columns usedColumns, exprs []sql.Expression) {
232+
for _, e := range exprs {
233+
expression.Inspect(e, func(e sql.Expression) bool {
234+
if gf, ok := e.(*expression.GetField); ok {
235+
if _, ok := columns[gf.Table()]; !ok {
236+
columns[gf.Table()] = make(map[string]struct{})
237+
}
238+
columns[gf.Table()][gf.Name()] = struct{}{}
239+
}
240+
return true
241+
})
242+
}
243+
}
244+
245+
func pruneProject(n *plan.Project, columns usedColumns) sql.Node {
246+
var remaining []sql.Expression
247+
for _, e := range n.Projections {
248+
if !shouldPruneExpr(e, columns) {
249+
remaining = append(remaining, e)
250+
}
251+
}
252+
253+
if len(remaining) == 0 {
254+
return n.Child
255+
}
256+
257+
return plan.NewProject(remaining, n.Child)
258+
}
259+
260+
func pruneGroupBy(n *plan.GroupBy, columns usedColumns) sql.Node {
261+
var remaining []sql.Expression
262+
for _, e := range n.Aggregate {
263+
if !shouldPruneExpr(e, columns) {
264+
remaining = append(remaining, e)
265+
}
266+
}
267+
268+
if len(remaining) == 0 {
269+
return n.Child
270+
}
271+
272+
return plan.NewGroupBy(remaining, n.Grouping, n.Child)
273+
}
274+
275+
func shouldPruneExpr(e sql.Expression, cols usedColumns) bool {
276+
gf, ok := e.(*expression.GetField)
277+
if !ok {
278+
return false
279+
}
280+
281+
if gf.Table() == "" {
282+
return false
283+
}
284+
285+
if c, ok := cols[gf.Table()]; ok {
286+
if _, ok := c[gf.Name()]; ok {
287+
return false
288+
}
289+
}
290+
291+
return true
292+
}
293+
294+
type subqueryBarrier struct {
295+
*plan.SubqueryAlias
296+
}
297+
298+
func (b *subqueryBarrier) TransformUp(f sql.TransformNodeFunc) (sql.Node, error) {
299+
return f(b)
300+
}

0 commit comments

Comments
 (0)