Skip to content

Commit d8b313a

Browse files
author
Dmitry Lenev
committed
A temporary workaround for bug #56405 "Deadlock in the
MDL deadlock detector". Deadlock could have occurred when workload containing mix of DML, DDL and FLUSH TABLES statements affecting same set of tables was executed in heavily concurrent environment. This deadlock occurred when several connections tried to perform deadlock detection in metadata locking subsystem. The first connection started traversing wait-for graph, encountered sub-graph representing wait for flush, acquired LOCK_open and dived into sub-graph inspection. When it has encounterd sub-graph corresponding to wait for metadata lock and blocked while trying to acquire rd-lock on MDL_lock::m_rwlock (*) protecting this subgraph, since some other thread had wr-lock on it. When this wr-lock was released it could have happened (if there was other pending wr-lock against this rwlock) that rd-lock from the first connection was left unsatisfied but at the same time new rd-lock request from the second connection sneaked in and was satisfied (for this to be possible second rd- request should come exactly after wr-lock is released but before pending wr-lock manages to grab rwlock, which is possible both on Linux and in our own rwlock implementation). If this second connection continued traversing wait-for graph and encountered sub-graph representing wait for flush it tried to acquire LOCK_open and thus deadlock was created. This patch tries to workaround this problem but not allowing deadlock detector to lock LOCK_open mutex if some other thread doing deadlock detection already owns it and current search depth is greater than 0. Instead deadlock is reported. Other possible solutions are either known to have negative effects on performance or require much more time for proper implementation and testing. No test case is provided as this bug is very hard to repeat in MTR environment but is repeatable with the help of RQG tests. --BZR-- revision-id: [email protected] property-branch-nick: mysql-5.5.6-m3-release property-file-info: ld7:file_id40:mdl.cc-20080523121737-j62pi0m62eaw1hq6-17:message261:Moved Deadlock_detection_visitor::m_current_search_depth to property-file-info: parent class to make it available in property-file-info: TABLE_SHARE::visit_subgraph(). property-file-info: Added MDL_wait_for_graph_visitor::abort_traversal() method property-file-info: which allows to abort traversal of a wait-for graph and property-file-info: report a deadlock.4:path10:sql/mdl.cced7:file_id39:mdl.h-20080523121748-o4y2wcq3maotb9do-17:message261:Moved Deadlock_detection_visitor::m_current_search_depth to property-file-info: parent class to make it available in property-file-info: TABLE_SHARE::visit_subgraph(). property-file-info: Added MDL_wait_for_graph_visitor::abort_traversal() method property-file-info: which allows to abort traversal of a wait-for graph and property-file-info: report a deadlock.4:path9:sql/mdl.hed7:file_id64:sp1f-sql_base.cc-19700101030959-w7tul2gb2n4jzayjwlslj3ybmf3uhk6a7:message151:Added dd_owns_lock_open counter and mutex protecting it to property-file-info: track number of connections which do deadlock detection and property-file-info: own or try to acquire LOCK_open.4:path15:sql/sql_base.cced7:file_id45:sql_base.h-20100331135644-cgcb6oowzqyx7fi3-127:message151:Added dd_owns_lock_open counter and mutex protecting it to property-file-info: track number of connections which do deadlock detection and property-file-info: own or try to acquire LOCK_open.4:path14:sql/sql_base.hed7:file_id61:sp1f-table.cc-19700101030959-nsxtem2adyqzwe6nz4cgrpcmts3o54v77:message215:Workaround bug #56405 but not allowing MDL deadlock detector property-file-info: to lock LOCK_open mutex if some other thread doing deadlock property-file-info: detection already owns it and current search depth is greater property-file-info: than 0. Instead report deadlock.4:path12:sql/table.ccee testament3-sha1: e192dbc29ae016554c132a6d1c1c9138f1ba46c7
1 parent 423bc1a commit d8b313a

5 files changed

Lines changed: 78 additions & 11 deletions

File tree

sql/mdl.cc

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ class Deadlock_detection_visitor: public MDL_wait_for_graph_visitor
124124
Deadlock_detection_visitor(MDL_context *start_node_arg)
125125
: m_start_node(start_node_arg),
126126
m_victim(NULL),
127-
m_current_search_depth(0),
128127
m_found_deadlock(FALSE)
129128
{}
130129
virtual bool enter_node(MDL_context *node);
@@ -133,6 +132,8 @@ class Deadlock_detection_visitor: public MDL_wait_for_graph_visitor
133132
virtual bool inspect_edge(MDL_context *dest);
134133

135134
MDL_context *get_victim() const { return m_victim; }
135+
136+
void abort_traversal(MDL_context *node);
136137
private:
137138
/**
138139
Change the deadlock victim to a new one if it has lower deadlock
@@ -147,13 +148,6 @@ class Deadlock_detection_visitor: public MDL_wait_for_graph_visitor
147148
MDL_context *m_start_node;
148149
/** If a deadlock is found, the context that identifies the victim. */
149150
MDL_context *m_victim;
150-
/** Set to the 0 at start. Increased whenever
151-
we descend into another MDL context (aka traverse to the next
152-
wait-for graph node). When MAX_SEARCH_DEPTH is reached, we
153-
assume that a deadlock is found, even if we have not found a
154-
loop.
155-
*/
156-
uint m_current_search_depth;
157151
/** TRUE if we found a deadlock. */
158152
bool m_found_deadlock;
159153
/**
@@ -187,7 +181,7 @@ class Deadlock_detection_visitor: public MDL_wait_for_graph_visitor
187181

188182
bool Deadlock_detection_visitor::enter_node(MDL_context *node)
189183
{
190-
m_found_deadlock= ++m_current_search_depth >= MAX_SEARCH_DEPTH;
184+
m_found_deadlock= m_current_search_depth >= MAX_SEARCH_DEPTH;
191185
if (m_found_deadlock)
192186
{
193187
DBUG_ASSERT(! m_victim);
@@ -207,7 +201,6 @@ bool Deadlock_detection_visitor::enter_node(MDL_context *node)
207201

208202
void Deadlock_detection_visitor::leave_node(MDL_context *node)
209203
{
210-
--m_current_search_depth;
211204
if (m_found_deadlock)
212205
opt_change_victim_to(node);
213206
}
@@ -251,6 +244,21 @@ Deadlock_detection_visitor::opt_change_victim_to(MDL_context *new_victim)
251244
}
252245

253246

247+
/**
248+
Abort traversal of a wait-for graph and report a deadlock.
249+
250+
@param node Node which we were about to visit when abort
251+
was initiated.
252+
*/
253+
254+
void Deadlock_detection_visitor::abort_traversal(MDL_context *node)
255+
{
256+
DBUG_ASSERT(! m_victim);
257+
m_found_deadlock= TRUE;
258+
opt_change_victim_to(node);
259+
}
260+
261+
254262
/**
255263
Get a bit corresponding to enum_mdl_type value in a granted/waiting bitmaps
256264
and compatibility matrices.
@@ -2056,8 +2064,13 @@ bool MDL_lock::visit_subgraph(MDL_ticket *waiting_ticket,
20562064
are visiting it but this is OK: in the worst case we might do some
20572065
extra work and one more context might be chosen as a victim.
20582066
*/
2067+
++gvisitor->m_current_search_depth;
2068+
20592069
if (gvisitor->enter_node(src_ctx))
2070+
{
2071+
--gvisitor->m_current_search_depth;
20602072
goto end;
2073+
}
20612074

20622075
/*
20632076
We do a breadth-first search first -- that is, inspect all
@@ -2114,6 +2127,7 @@ bool MDL_lock::visit_subgraph(MDL_ticket *waiting_ticket,
21142127

21152128
end_leave_node:
21162129
gvisitor->leave_node(src_ctx);
2130+
--gvisitor->m_current_search_depth;
21172131

21182132
end:
21192133
mysql_prlock_unlock(&m_rwlock);

sql/mdl.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,10 @@ class MDL_wait_for_graph_visitor
385385

386386
virtual bool inspect_edge(MDL_context *dest) = 0;
387387
virtual ~MDL_wait_for_graph_visitor();
388-
MDL_wait_for_graph_visitor() :m_lock_open_count(0) {}
388+
MDL_wait_for_graph_visitor() :m_lock_open_count(0),
389+
m_current_search_depth(0)
390+
{ }
391+
virtual void abort_traversal(MDL_context *node) = 0;
389392
public:
390393
/**
391394
XXX, hack: During deadlock search, we may need to
@@ -396,6 +399,17 @@ class MDL_wait_for_graph_visitor
396399
LOCK_open since it has significant performance impacts.
397400
*/
398401
uint m_lock_open_count;
402+
/**
403+
Set to the 0 at start. Increased whenever
404+
we descend into another MDL context (aka traverse to the next
405+
wait-for graph node). When MAX_SEARCH_DEPTH is reached, we
406+
assume that a deadlock is found, even if we have not found a
407+
loop.
408+
409+
XXX: This member belongs to this class only temporarily until
410+
bug #56405 is fixed.
411+
*/
412+
uint m_current_search_depth;
399413
};
400414

401415
/**

sql/sql_base.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ bool No_such_table_error_handler::safely_trapped_errors()
100100
TABLE_SHAREs, refresh_version and the table id counter.
101101
*/
102102
mysql_mutex_t LOCK_open;
103+
mysql_mutex_t LOCK_dd_owns_lock_open;
104+
uint dd_owns_lock_open= 0;
103105

104106
#ifdef HAVE_PSI_INTERFACE
105107
static PSI_mutex_key key_LOCK_open;
@@ -298,6 +300,7 @@ bool table_def_init(void)
298300
init_tdc_psi_keys();
299301
#endif
300302
mysql_mutex_init(key_LOCK_open, &LOCK_open, MY_MUTEX_INIT_FAST);
303+
mysql_mutex_init(NULL, &LOCK_dd_owns_lock_open, MY_MUTEX_INIT_FAST);
301304
oldest_unused_share= &end_of_unused_share;
302305
end_of_unused_share.prev= &oldest_unused_share;
303306

@@ -341,6 +344,7 @@ void table_def_free(void)
341344
table_def_inited= 0;
342345
/* Free table definitions. */
343346
my_hash_free(&table_def_cache);
347+
mysql_mutex_destroy(&LOCK_dd_owns_lock_open);
344348
mysql_mutex_destroy(&LOCK_open);
345349
}
346350
DBUG_VOID_RETURN;

sql/sql_base.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ enum enum_tdc_remove_table_type {TDC_RT_REMOVE_ALL, TDC_RT_REMOVE_NOT_OWN,
7171

7272
bool check_dup(const char *db, const char *name, TABLE_LIST *tables);
7373
extern mysql_mutex_t LOCK_open;
74+
extern mysql_mutex_t LOCK_dd_owns_lock_open;
75+
extern uint dd_owns_lock_open;
7476
bool table_cache_init(void);
7577
void table_cache_free(void);
7678
bool table_def_init(void);

sql/table.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3085,7 +3085,30 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush,
30853085
holding a write-lock on MDL_lock::m_rwlock.
30863086
*/
30873087
if (gvisitor->m_lock_open_count++ == 0)
3088+
{
3089+
/*
3090+
To circumvent bug #56405 "Deadlock in the MDL deadlock detector"
3091+
we don't try to lock LOCK_open mutex if some thread doing
3092+
deadlock detection already owns it and current search depth is
3093+
greater than 0. Instead we report a deadlock.
3094+
3095+
TODO/FIXME: The proper fix for this bug is to use rwlocks for
3096+
protection of table shares/instead of LOCK_open.
3097+
Unfortunately it requires more effort/has significant
3098+
performance effect.
3099+
*/
3100+
mysql_mutex_lock(&LOCK_dd_owns_lock_open);
3101+
if (gvisitor->m_current_search_depth > 0 && dd_owns_lock_open > 0)
3102+
{
3103+
mysql_mutex_unlock(&LOCK_dd_owns_lock_open);
3104+
--gvisitor->m_lock_open_count;
3105+
gvisitor->abort_traversal(src_ctx);
3106+
return TRUE;
3107+
}
3108+
++dd_owns_lock_open;
3109+
mysql_mutex_unlock(&LOCK_dd_owns_lock_open);
30883110
mysql_mutex_lock(&LOCK_open);
3111+
}
30893112

30903113
I_P_List_iterator <TABLE, TABLE_share> tables_it(used_tables);
30913114

@@ -3100,8 +3123,12 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush,
31003123
goto end;
31013124
}
31023125

3126+
++gvisitor->m_current_search_depth;
31033127
if (gvisitor->enter_node(src_ctx))
3128+
{
3129+
--gvisitor->m_current_search_depth;
31043130
goto end;
3131+
}
31053132

31063133
while ((table= tables_it++))
31073134
{
@@ -3124,10 +3151,16 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush,
31243151

31253152
end_leave_node:
31263153
gvisitor->leave_node(src_ctx);
3154+
--gvisitor->m_current_search_depth;
31273155

31283156
end:
31293157
if (gvisitor->m_lock_open_count-- == 1)
3158+
{
31303159
mysql_mutex_unlock(&LOCK_open);
3160+
mysql_mutex_lock(&LOCK_dd_owns_lock_open);
3161+
--dd_owns_lock_open;
3162+
mysql_mutex_unlock(&LOCK_dd_owns_lock_open);
3163+
}
31313164

31323165
return result;
31333166
}

0 commit comments

Comments
 (0)