-
Notifications
You must be signed in to change notification settings - Fork 411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
MPP: update the state of building a hash table when createOne throws exceptions #4202
Changes from 9 commits
242226f
1cc54ad
7203668
15a805b
f2e0b30
d50e693
21fde10
b4bd64e
863290e
eedcbd6
a12989d
10bf2a9
8ae5dcd
2b02d65
62d15f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,8 @@ namespace DB | |
namespace FailPoints | ||
{ | ||
extern const char exception_in_creating_set_input_stream[]; | ||
} | ||
extern const char exception_mpp_hash_build[]; | ||
} // namespace FailPoints | ||
namespace ErrorCodes | ||
{ | ||
extern const int SET_SIZE_LIMIT_EXCEEDED; | ||
|
@@ -108,9 +109,10 @@ void CreatingSetsBlockInputStream::createAll() | |
for (auto & elem : subqueries_for_sets) | ||
{ | ||
if (elem.second.join) | ||
elem.second.join->setFinishBuildTable(false); | ||
elem.second.join->setBuildTableState(Join::BuildTableState::WAITING); | ||
} | ||
} | ||
Stopwatch watch; | ||
auto thread_manager = newThreadManager(); | ||
for (auto & subqueries_for_sets : subqueries_for_sets_list) | ||
{ | ||
|
@@ -129,27 +131,31 @@ void CreatingSetsBlockInputStream::createAll() | |
thread_manager->wait(); | ||
|
||
if (!exception_from_workers.empty()) | ||
{ | ||
LOG_FMT_ERROR(log, "Creating all tasks of {} takes {} sec with exception and rethrow the first of total {} exceptions", mpp_task_id.toString(), watch.elapsedSeconds(), exception_from_workers.size()); | ||
std::rethrow_exception(exception_from_workers.front()); | ||
} | ||
LOG_FMT_DEBUG(log, "Creating all tasks of {} takes {} sec. ", mpp_task_id.toString(), watch.elapsedSeconds()); | ||
|
||
created = true; | ||
} | ||
} | ||
|
||
void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) | ||
{ | ||
auto log_msg = fmt::format("{} for task {}", | ||
subquery.set ? "Creating set. " : subquery.join ? "Creating join. " | ||
: subquery.table ? "Filling temporary table. " | ||
: "null subquery", | ||
mpp_task_id.toString()); | ||
Stopwatch watch; | ||
try | ||
{ | ||
LOG_DEBUG(log, | ||
(subquery.set ? "Creating set. " : "") | ||
<< (subquery.join ? "Creating join. " : "") << (subquery.table ? "Filling temporary table. " : "") << " for task " | ||
<< mpp_task_id.toString()); | ||
Stopwatch watch; | ||
|
||
LOG_FMT_DEBUG(log, "{}", log_msg); | ||
BlockOutputStreamPtr table_out; | ||
if (subquery.table) | ||
table_out = subquery.table->write({}, {}); | ||
|
||
|
||
bool done_with_set = !subquery.set; | ||
bool done_with_join = !subquery.join; | ||
bool done_with_table = !subquery.table; | ||
|
@@ -164,7 +170,7 @@ void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) | |
{ | ||
if (isCancelled()) | ||
{ | ||
LOG_DEBUG(log, "Query was cancelled during set / join or temporary table creation."); | ||
LOG_FMT_DEBUG(log, "Query was cancelled during set / join or temporary table creation."); | ||
return; | ||
} | ||
|
||
|
@@ -209,7 +215,10 @@ void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) | |
|
||
|
||
if (subquery.join) | ||
subquery.join->setFinishBuildTable(true); | ||
{ | ||
FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_mpp_hash_build); | ||
subquery.join->setBuildTableState(Join::BuildTableState::SUCCEED); | ||
} | ||
|
||
if (table_out) | ||
table_out->writeSuffix(); | ||
|
@@ -243,20 +252,20 @@ void CreatingSetsBlockInputStream::createOne(SubqueryForSet & subquery) | |
msg << "In " << watch.elapsedSeconds() << " sec. "; | ||
msg << "using " << std::to_string(subquery.join == nullptr ? 1 : subquery.join->getBuildConcurrency()) << " threads "; | ||
|
||
if (log != nullptr) | ||
LOG_DEBUG(log, msg.rdbuf()); | ||
else | ||
LOG_DEBUG(log, msg.rdbuf()); | ||
LOG_FMT_DEBUG(log, "{}", msg.rdbuf()->str()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: use fmt for generating |
||
} | ||
else | ||
{ | ||
LOG_DEBUG(log, "Subquery has empty result for task " << mpp_task_id.toString() << "."); | ||
LOG_FMT_DEBUG(log, "Subquery has empty result for task {}. ", mpp_task_id.toString()); | ||
} | ||
} | ||
catch (std::exception & e) | ||
catch (...) | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not merge the two There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
std::unique_lock<std::mutex> lock(exception_mutex); | ||
exception_from_workers.push_back(std::current_exception()); | ||
if (subquery.join) | ||
subquery.join->setBuildTableState(Join::BuildTableState::FAILED); | ||
LOG_FMT_ERROR(log, "{} throw exception: {} In {} sec. ", log_msg, getCurrentExceptionMessage(false, true), watch.elapsedSeconds()); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,5 +81,37 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang | |
=> DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) | ||
=> DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel) | ||
|
||
## exception during mpp hash build | ||
## desc format='brief' select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id; | ||
## +-----------------------------------------+---------+-------------------+---------------+-------------------------------------------------------------------------+ | ||
## | id | estRows | task | access object | operator info | | ||
## +-----------------------------------------+---------+-------------------+---------------+-------------------------------------------------------------------------+ | ||
## | Projection | 0.99 | root | | test.t.id | | ||
## | └─TableReader | 0.99 | root | | data:ExchangeSender | | ||
## | └─ExchangeSender | 0.99 | batchCop[tiflash] | | ExchangeType: PassThrough | | ||
## | └─HashJoin | 0.99 | batchCop[tiflash] | | inner join, equal:[eq(test.t.id, test.t.id)] | | ||
## | ├─HashJoin(Build) | 0.99 | batchCop[tiflash] | | inner join, equal:[eq(test.t.id, test.t.id)] | | ||
## | │ ├─ExchangeReceiver(Build) | 1.00 | batchCop[tiflash] | | | | ||
## | │ │ └─ExchangeSender | 1.00 | batchCop[tiflash] | | ExchangeType: HashPartition, Hash Cols: [name: test.t.id, collate: N/A] | | ||
## | │ │ └─Selection | 1.00 | batchCop[tiflash] | | lt(test.t.id, 2), not(isnull(test.t.id)) | | ||
## | │ │ └─TableFullScan | 3.00 | batchCop[tiflash] | table:t1 | keep order:false, stats:pseudo | | ||
## | │ └─ExchangeReceiver(Probe) | 1.00 | batchCop[tiflash] | | | | ||
## | │ └─ExchangeSender | 1.00 | batchCop[tiflash] | | ExchangeType: HashPartition, Hash Cols: [name: test.t.id, collate: N/A] | | ||
## | │ └─Selection | 1.00 | batchCop[tiflash] | | lt(test.t.id, 2), not(isnull(test.t.id)) | | ||
## | │ └─TableFullScan | 3.00 | batchCop[tiflash] | table:t2 | keep order:false, stats:pseudo | | ||
## | └─Projection(Probe) | 2.40 | batchCop[tiflash] | | test.t.id | | ||
## | └─HashAgg | 2.40 | batchCop[tiflash] | | group by:test.t.id, funcs:firstrow(test.t.id)->test.t.id | | ||
## | └─ExchangeReceiver | 2.40 | batchCop[tiflash] | | | | ||
## | └─ExchangeSender | 2.40 | batchCop[tiflash] | | ExchangeType: HashPartition, Hash Cols: [name: test.t.id, collate: N/A] | | ||
## | └─HashAgg | 2.40 | batchCop[tiflash] | | group by:test.t.id, | | ||
## | └─Selection | 3.00 | batchCop[tiflash] | | not(isnull(test.t.id)) | | ||
## | └─TableFullScan | 3.00 | batchCop[tiflash] | table:t | keep order:false, stats:pseudo | | ||
## +-----------------------------------------+---------+-------------------+---------------+-------------------------------------------------------------------------+ | ||
## Ensure build1, build2-probe1, and probe2 are all in the CreatingSets. This tests the bug where build1 throws an exception but does not change the build state, which blocks build2-probe1 and eventually makes the query hang. | ||
=> DBGInvoke __enable_fail_point(exception_mpp_hash_build) | ||
mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; set @@tidb_broadcast_join_threshold_count=0; set @@tidb_broadcast_join_threshold_size=0; select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this test make TiFlash hang before this PR? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, the test procedure below guarantees this. The integration test also ensures it. |
||
ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered. | ||
=> DBGInvoke __disable_fail_point(exception_mpp_hash_build) | ||
|
||
# Clean up. | ||
mysql> drop table if exists test.t |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
my suggestion is:
then the fmt will only run when log_level <= debug.