plan: maintain HistColl in Selection and Projection's statsInfo. #7595

Closed
wants to merge 10 commits
22 changes: 22 additions & 0 deletions plan/cbo_test.go
@@ -690,6 +690,28 @@ func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) {
))
}

func (s *testAnalyzeSuite) TestHistogramInPlan(c *C) {
Member: Add comments in the test case.

	defer testleak.AfterTest(c)()
	store, dom, err := newStoreWithBootstrap()
	c.Assert(err, IsNil)
	tk := testkit.NewTestKit(c, store)
	defer func() {
		dom.Close()
		store.Close()
	}()
	tk.MustExec("use test")
	tk.MustExec("set @@session.tidb_optimizer_selectivity_level=1")
	tk.MustExec(`create table t(a enum("a", "b", "c"), index idx(a))`)
	tk.MustExec(`insert into t values("a"), ("a"), ("a"), ("a"), ("b"), ("b"), ("b"), ("C")`)
	tk.MustExec("analyze table t")
	tk.MustQuery("explain select * from t where a = 'c'").
		Check(testkit.Rows(
			"Selection_5 1.00 root eq(test.t.a, \"c\")",
			"└─TableReader_7 8.00 root data:TableScan_6",
			"  └─TableScan_6 8.00 cop table:t, range:[-inf,+inf], keep order:false",
		))
}
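For context on the expected plan above: with the default estimation, the Selection's row count is the child's row count scaled by a fixed selection factor, whereas with tidb_optimizer_selectivity_level=1 it can consult the column histogram, which sees 'c' exactly once among the eight analyzed rows (the inserted "C" resolves to the enum value 'c'). A minimal, self-contained sketch of that difference; the 0.8 factor and the histogram lookup are assumptions for illustration, not code from this PR:

package main

import "fmt"

func main() {
	const selectionFactor = 0.8 // assumed default factor used when selectivity cannot be estimated
	childRows := 8.0            // rows produced by the TableScan over the test table

	// Default estimate: scale the child's row count by the fixed factor.
	defaultEstimate := childRows * selectionFactor // 6.4

	// Histogram-based estimate: the analyzed histogram sees 'c' once in 8 rows,
	// so the selectivity is 1/8 and the Selection is expected to output 1 row.
	histogramSelectivity := 1.0 / 8.0
	histogramEstimate := childRows * histogramSelectivity // 1.0

	fmt.Printf("default: %.2f rows, histogram-based: %.2f rows\n", defaultEstimate, histogramEstimate)
}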

func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
store, err := mockstore.NewMockTikvStore()
if err != nil {
10 changes: 5 additions & 5 deletions plan/logical_plans.go
@@ -406,7 +406,7 @@ func (ds *DataSource) deriveTablePathStats(path *accessPath) (bool, error) {
// deriveIndexPathStats will fulfill the information that the accessPath need.
// And it will check whether this index is full matched by point query. We will use this check to
// determine whether we remove other paths or not.
func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
func (ds *DataSource) deriveIndexPathStats(path *accessPath, histColl *statistics.HistColl) (bool, error) {
var err error
sc := ds.ctx.GetSessionVars().StmtCtx
path.ranges = ranger.FullRange()
@@ -417,7 +417,7 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
if err != nil {
return false, errors.Trace(err)
}
path.countAfterAccess, err = ds.stats.histColl.GetRowCountByIndexRanges(sc, path.index.ID, path.ranges)
path.countAfterAccess, err = histColl.GetRowCountByIndexRanges(sc, path.index.ID, path.ranges)
if err != nil {
return false, errors.Trace(err)
}
@@ -435,8 +435,8 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
}
path.indexFilters, path.tableFilters = splitIndexFilterConditions(path.tableFilters, path.index.Columns, ds.tableInfo)
if corColInAccessConds {
idxHist, ok := ds.stats.histColl.Indices[path.index.ID]
if ok && !ds.stats.histColl.Pseudo {
idxHist, ok := histColl.Indices[path.index.ID]
if ok && !histColl.Pseudo {
path.countAfterAccess = idxHist.AvgCountPerValue(ds.statisticTable.Count)
} else {
path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
@@ -448,7 +448,7 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
}
if path.indexFilters != nil {
selectivity, err := ds.stats.histColl.Selectivity(ds.ctx, path.indexFilters)
selectivity, err := histColl.Selectivity(ds.ctx, path.indexFilters)
if err != nil {
log.Warnf("An error happened: %v, we have to use the default selectivity", err.Error())
selectivity = selectionFactor
88 changes: 77 additions & 11 deletions plan/stats.go
@@ -28,7 +28,7 @@ type statsInfo struct {
count float64
cardinality []float64

histColl statistics.HistColl
histColl *statistics.HistColl
// usePseudoStats indicates whether the statsInfo is calculated using the
// pseudo statistics on a table.
usePseudoStats bool
@@ -47,7 +47,6 @@ func (s *statsInfo) scale(factor float64) *statsInfo {
profile := &statsInfo{
count: s.count * factor,
cardinality: make([]float64, len(s.cardinality)),
histColl: s.histColl,
Member Author: scale is called by scaleByExpectedCnt, which is called many times while building the physical plan. It is hard to account for expectedCnt in histogram-based stats estimation, so I think the HistColl does not need to be maintained in that case. That is why the HistColl is maintained outside this method.

Contributor: But since the problem lies in scaleByExpectedCnt, why not handle it in that function?

Member Author (@winoros, Sep 6, 2018): Hmm, could you explain that more clearly?

usePseudoStats: s.usePseudoStats,
}
for i := range profile.cardinality {
@@ -110,11 +109,10 @@ func (p *baseLogicalPlan) deriveStats() (*statsInfo, error) {
return profile, nil
}
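As background for the scale discussion above, here is a standalone toy sketch of the relationship the author describes. The name scaleByExpectedCnt and its exact behaviour are taken from the review comments rather than verified against the TiDB codebase, and the types are reduced to toy versions:

// Toy reductions of the real types, for illustration only.
type toyStatsInfo struct {
	count       float64
	cardinality []float64
}

func (s *toyStatsInfo) scale(factor float64) *toyStatsInfo {
	out := &toyStatsInfo{count: s.count * factor, cardinality: make([]float64, len(s.cardinality))}
	for i := range s.cardinality {
		out.cardinality[i] = s.cardinality[i] * factor
	}
	return out
}

// scaleByExpectedCnt shrinks the stats to a requested row count by delegating
// to scale. Because it may run once per candidate physical plan, anything that
// scale copies (such as a HistColl) would be copied on every call, which is the
// cost the author wants to avoid.
func (s *toyStatsInfo) scaleByExpectedCnt(expectedCnt float64) *toyStatsInfo {
	if expectedCnt >= s.count || s.count <= 0 {
		return s
	}
	return s.scale(expectedCnt / s.count)
}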

func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *statsInfo {
profile := &statsInfo{
func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) (profile *statsInfo, histColl *statistics.HistColl) {
Member: It seems that not all of the code changes to the DataSource functions are relevant here?

Member Author: Yes, we can extract the part that is shared with selection.deriveStats.


profile = &statsInfo{
count: float64(ds.statisticTable.Count),
cardinality: make([]float64, len(ds.Columns)),
histColl: ds.statisticTable.GenerateHistCollFromColumnInfo(ds.Columns, ds.schema.Columns),
usePseudoStats: ds.statisticTable.Pseudo,
}
for i, col := range ds.Columns {
@@ -126,21 +124,22 @@ func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *statsInfo {
profile.cardinality[i] = profile.count * distinctFactor
}
}
ds.stats = profile
selectivity, err := profile.histColl.Selectivity(ds.ctx, conds)
histColl = ds.statisticTable.GenerateHistCollFromColumnInfo(ds.Columns, ds.schema.Columns)
selectivity, err := histColl.Selectivity(ds.ctx, conds)
if err != nil {
log.Warnf("An error happened: %v, we have to use the default selectivity", err.Error())
selectivity = selectionFactor
}
return profile.scale(selectivity)
return profile.scale(selectivity), histColl
}

func (ds *DataSource) deriveStats() (*statsInfo, error) {
// PushDownNot here can convert query 'not (a != 1)' to 'a = 1'.
for i, expr := range ds.pushedDownConds {
ds.pushedDownConds[i] = expression.PushDownNot(nil, expr, false)
}
ds.stats = ds.getStatsByFilter(ds.pushedDownConds)
statsInfo, histColl := ds.getStatsByFilter(ds.pushedDownConds)
ds.stats = statsInfo
for _, path := range ds.possibleAccessPaths {
if path.isTablePath {
noIntervalRanges, err := ds.deriveTablePathStats(path)
@@ -155,7 +154,7 @@ func (ds *DataSource) deriveStats() (*statsInfo, error) {
}
continue
}
noIntervalRanges, err := ds.deriveIndexPathStats(path)
noIntervalRanges, err := ds.deriveIndexPathStats(path, histColl)
if err != nil {
return nil, errors.Trace(err)
}
@@ -166,6 +165,8 @@ func (ds *DataSource) deriveStats() (*statsInfo, error) {
break
}
}
histColl.Count = int64(ds.stats.count)
ds.stats.histColl = histColl
return ds.stats, nil
}
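A note on the two assignments added at the end of deriveStats above: the collection's Count is refreshed to the post-filter row count so that operators further up (such as the Selection change later in this diff) compute selectivities against the rows that actually flow out of the DataSource rather than the raw table size. The toy calculation below only illustrates that ratio; treating Count as the denominator of the selectivity is an assumption about the statistics package, not something shown in this diff:

// Toy numbers: 10000 rows in the table, 2000 survive the pushed-down filters,
// and the histograms say 100 rows match the next predicate.
func toySelectivity() (stale, fresh float64) {
	tableCount := 10000.0
	filteredCount := 2000.0
	matchingRows := 100.0

	stale = matchingRows / tableCount    // 0.01 if Count were left at the raw table size
	fresh = matchingRows / filteredCount // 0.05 with Count refreshed to the filtered row count
	return stale, fresh
}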

@@ -174,7 +175,23 @@ func (p *LogicalSelection) deriveStats() (*statsInfo, error) {
if err != nil {
return nil, errors.Trace(err)
}
p.stats = childProfile.scale(selectionFactor)
factor := selectionFactor
Member: how about s/factor/selecity/?

Member Author: Em, "selecity" is not a word; do you mean selectivity?

Member: Yeah.

if p.ctx.GetSessionVars().OptimizerSelectivityLevel == 1 && childProfile.histColl != nil {
factor, err = childProfile.histColl.Selectivity(p.ctx, p.Conditions)
if err != nil {
return nil, errors.Trace(err)
}
}
p.stats = childProfile.scale(factor)
if childProfile.histColl != nil {
p.stats.histColl = &statistics.HistColl{
Count: int64(p.stats.count),
Columns: childProfile.histColl.Columns,
Indices: childProfile.histColl.Indices,
Idx2ColumnIDs: childProfile.histColl.Idx2ColumnIDs,
ColID2IdxID: childProfile.histColl.ColID2IdxID,
}
}
return p.stats, nil
}
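To spell out the behaviour added to LogicalSelection.deriveStats above: the histogram path is taken only when the session opts in and the child actually carries a HistColl; otherwise the estimate falls back to the fixed default factor. A self-contained toy version of that decision (the 0.8 default is an assumption mirroring the plan package's selectionFactor):

// estimateSelectionRows is a toy stand-in for the decision above, not TiDB code.
func estimateSelectionRows(childRows float64, histSelectivity *float64, selectivityLevel int) float64 {
	const defaultSelectionFactor = 0.8 // assumed default
	if selectivityLevel == 1 && histSelectivity != nil {
		// Histogram-backed path: use the selectivity computed from the child's HistColl.
		return childRows * *histSelectivity
	}
	// Fallback: scale by the fixed factor, as before this PR.
	return childRows * defaultSelectionFactor
}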

@@ -254,9 +271,58 @@ func (p *LogicalProjection) deriveStats() (*statsInfo, error) {
cols := expression.ExtractColumns(expr)
p.stats.cardinality[i] = getCardinality(cols, p.children[0].Schema(), childProfile)
}
// If enhanced selectivity estimation is enabled, try to maintain the histogram.
if p.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 && !childProfile.histColl.Pseudo {
p.deriveHistStats(childProfile)
}
return p.stats, nil
}

func (p *LogicalProjection) deriveHistStats(childProfile *statsInfo) {
Member: Add comments in the function.

	colHistMap := make(map[int64]*statistics.Column)
	colIDMap := make(map[int64]int64)
	childHist := childProfile.histColl
	for i, expr := range p.Exprs {
		col, ok := expr.(*expression.Column)
		if !ok {
			continue
		}
		colHist, ok := childHist.Columns[col.UniqueID]
		if !ok {
			continue
		}
		colHistMap[p.schema.Columns[i].UniqueID] = colHist
		colIDMap[col.UniqueID] = p.schema.Columns[i].UniqueID
	}

	colID2IdxID := make(map[int64]int64)
	idx2ColumnIDs := make(map[int64][]int64)
	idxHistMap := make(map[int64]*statistics.Index)
	for id, colIDs := range childHist.Idx2ColumnIDs {
		newIDList := make([]int64, 0, len(colIDs))
		for _, id := range colIDs {
			if newID, ok := colIDMap[id]; ok {
				newIDList = append(newIDList, newID)
				continue
			}
			break
		}
		if len(newIDList) == 0 {
			continue
		}
		colID2IdxID[newIDList[0]] = id
		idx2ColumnIDs[id] = newIDList
		idxHistMap[id] = childHist.Indices[id]
	}
	p.stats.histColl = &statistics.HistColl{
		Columns:       colHistMap,
		Indices:       idxHistMap,
		Idx2ColumnIDs: idx2ColumnIDs,
		ColID2IdxID:   colID2IdxID,
		Count:         childHist.Count,
	}
}
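To make the remapping that deriveHistStats performs concrete, here is a hypothetical, self-contained sketch using toy IDs rather than TiDB types: child column UniqueIDs that survive the projection are translated to the projection's output UniqueIDs, and an index histogram is kept only for the prefix of its columns that survives.

package main

import "fmt"

func main() {
	// Child UniqueID -> projection output UniqueID, e.g. for `SELECT a, b FROM t`
	// where a has child ID 1 and output ID 11, and b has 2 -> 12. Column 3 is
	// projected away, so it has no mapping.
	colIDMap := map[int64]int64{1: 11, 2: 12}

	// Index 100 covers child columns (1, 2, 3).
	childIdx2ColumnIDs := map[int64][]int64{100: {1, 2, 3}}

	newIdx2ColumnIDs := make(map[int64][]int64)
	for idxID, colIDs := range childIdx2ColumnIDs {
		newIDs := make([]int64, 0, len(colIDs))
		for _, id := range colIDs {
			newID, ok := colIDMap[id]
			if !ok {
				break // only the surviving prefix of the index columns is kept
			}
			newIDs = append(newIDs, newID)
		}
		if len(newIDs) > 0 {
			newIdx2ColumnIDs[idxID] = newIDs
		}
	}
	fmt.Println(newIdx2ColumnIDs) // map[100:[11 12]]
}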

func (la *LogicalAggregation) deriveStats() (*statsInfo, error) {
childProfile, err := la.children[0].deriveStats()
if err != nil {
2 changes: 1 addition & 1 deletion sessionctx/variable/varsutil.go
@@ -293,7 +293,7 @@ func ValidateSetSystemVar(vars *SessionVars, name string, value string) (string,
TiDBDistSQLScanConcurrency,
TiDBIndexSerialScanConcurrency, TiDBDDLReorgWorkerCount,
TiDBBackoffLockFast, TiDBMaxChunkSize,
TiDBDMLBatchSize, TiDBOptimizerSelectivityLevel:
Member: Why remove it?

Member Author: This validation is not right: the variable's default value is zero, but the check here requires it to be non-zero.

TiDBDMLBatchSize:
v, err := strconv.Atoi(value)
if err != nil {
return value, ErrWrongTypeForVar.GenByArgs(name)
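A minimal, generic illustration of the problem the author describes (this is not the actual varsutil code): a validator that rejects non-positive integers also rejects a variable whose default value is 0, so setting the variable back to its default would fail.

package main

import (
	"fmt"
	"strconv"
)

// validatePositiveInt mirrors the kind of check being dropped: it refuses 0,
// which conflicts with a variable whose documented default is 0.
func validatePositiveInt(value string) error {
	v, err := strconv.Atoi(value)
	if err != nil {
		return fmt.Errorf("wrong type for variable: %q", value)
	}
	if v <= 0 {
		return fmt.Errorf("value must be greater than 0, got %d", v)
	}
	return nil
}

func main() {
	fmt.Println(validatePositiveInt("0")) // rejects the default value
}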
4 changes: 2 additions & 2 deletions statistics/table.go
@@ -471,7 +471,7 @@ func getOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int
}

// GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxID and IdxID2ColIDs is built from the given parameter.
func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, columns []*expression.Column) HistColl {
func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, columns []*expression.Column) *HistColl {
newColHistMap := make(map[int64]*Column)
colInfoID2UniqueID := make(map[int64]int64)
colNames2UniqueID := make(map[string]int64)
@@ -511,7 +511,7 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo,
newIdxHistMap[idxHist.ID] = idxHist
idx2Columns[idxHist.ID] = ids
}
newColl := HistColl{
newColl := &HistColl{
PhysicalID: coll.PhysicalID,
HavePhysicalID: coll.HavePhysicalID,
Pseudo: coll.Pseudo,