-
Notifications
You must be signed in to change notification settings - Fork 558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[teamd]: Clean teamd process if LAG creation fails #2888
Changes from 4 commits
9c6877f
111d91a
706b5e6
4bb1aba
fb5176a
2072d0c
5c88f99
fc0582d
87225ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,6 +307,8 @@ void TeamMgr::doLagTask(Consumer &consumer) | |
{ | ||
if (addLag(alias, min_links, fallback, fast_rate) == task_need_retry) | ||
{ | ||
// If LAG creation fails, we need to clean up any potentially orphaned teamd processes | ||
removeLag(alias); | ||
it++; | ||
continue; | ||
} | ||
|
@@ -654,10 +656,46 @@ bool TeamMgr::removeLag(const string &alias) | |
|
||
stringstream cmd; | ||
string res; | ||
pid_t pid; | ||
|
||
cmd << TEAMD_CMD << " -k -t " << shellquote(alias); | ||
EXEC_WITH_ERROR_THROW(cmd.str(), res); | ||
|
||
try | ||
{ | ||
std::stringstream cmd; | ||
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid"); | ||
EXEC_WITH_ERROR_THROW(cmd.str(), res); | ||
} | ||
catch (const std::exception &e) | ||
{ | ||
SWSS_LOG_NOTICE("Failed to remove non-existent port channel %s pid...", alias.c_str()); | ||
return false; | ||
} | ||
Comment on lines
+661
to
+671
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking about this a bit more, IMO this is internal implementation details of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Teammgrd already reads this file in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
try | ||
{ | ||
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10)); | ||
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid); | ||
} | ||
catch (const std::exception &e) | ||
{ | ||
SWSS_LOG_ERROR("Failed to read port channel %s pid: %s", alias.c_str(), e.what()); | ||
return false; | ||
} | ||
|
||
try | ||
{ | ||
std::stringstream cmd; | ||
cmd << "kill -TERM " << pid; | ||
EXEC_WITH_ERROR_THROW(cmd.str(), res); | ||
} | ||
catch (const std::exception &e) | ||
{ | ||
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, e.what()); | ||
return false; | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to do this? It should be part of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, since |
||
SWSS_LOG_NOTICE("Stop port channel %s", alias.c_str()); | ||
|
||
return true; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#include "gtest/gtest.h" | ||
#include "../mock_table.h" | ||
#include "teammgr.h" | ||
|
||
extern int (*callback)(const std::string &cmd, std::string &stdout); | ||
extern std::vector<std::string> mockCallArgs; | ||
|
||
int cb(const std::string &cmd, std::string &stdout) | ||
{ | ||
mockCallArgs.push_back(cmd); | ||
if (cmd.find("/usr/bin/teamd -r -t PortChannel1") != std::string::npos) | ||
{ | ||
return 1; | ||
} | ||
else if (cmd.find("cat /var/run/teamd/PortChannel1.pid") != std::string::npos) | ||
{ | ||
stdout = "1234"; | ||
return 0; | ||
} | ||
return 0; | ||
} | ||
|
||
namespace teammgr_ut | ||
{ | ||
struct TeamMgrTest : public ::testing::Test | ||
{ | ||
std::shared_ptr<swss::DBConnector> m_config_db; | ||
std::shared_ptr<swss::DBConnector> m_app_db; | ||
std::shared_ptr<swss::DBConnector> m_state_db; | ||
std::vector<TableConnector> cfg_lag_tables; | ||
|
||
virtual void SetUp() override | ||
{ | ||
testing_db::reset(); | ||
m_config_db = std::make_shared<swss::DBConnector>("CONFIG_DB", 0); | ||
m_app_db = std::make_shared<swss::DBConnector>("APPL_DB", 0); | ||
m_state_db = std::make_shared<swss::DBConnector>("STATE_DB", 0); | ||
|
||
m_config_db.get()->hset("DEVICE_METADATA|localhost", "mac", "01:23:45:67:89:ab"); | ||
|
||
TableConnector conf_lag_table(m_config_db.get(), CFG_LAG_TABLE_NAME); | ||
TableConnector conf_lag_member_table(m_config_db.get(), CFG_LAG_MEMBER_TABLE_NAME); | ||
TableConnector state_port_table(m_state_db.get(), STATE_PORT_TABLE_NAME); | ||
|
||
std::vector<TableConnector> tables = { | ||
conf_lag_table, | ||
conf_lag_member_table, | ||
state_port_table | ||
}; | ||
|
||
cfg_lag_tables = tables; | ||
mockCallArgs.clear(); | ||
callback = cb; | ||
} | ||
}; | ||
|
||
TEST_F(TeamMgrTest, testProcessKilledAfterAddLagFailure) | ||
{ | ||
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables); | ||
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME); | ||
cfg_lag_table.set("PortChannel1", { { "admin_status", "up" }, | ||
{ "mtu", "9100" }, | ||
{ "lacp_key", "auto" }, | ||
{ "min_links", "2" } }); | ||
teammgr.addExistingData(&cfg_lag_table); | ||
int kill_cmd_called = 0; | ||
for (auto cmd : mockCallArgs){ | ||
if (cmd.find("kill -TERM 1234") != std::string::npos){ | ||
kill_cmd_called++; | ||
} | ||
} | ||
ASSERT_EQ(kill_cmd_called, 1); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could be causing issues. IIUC if the process actually wasn't running, then
teamd -k -t PortChannel.1
would fail. So in the case whereaddLag()
fails, but noteamd ...
running, thenteamd -k -t PortChannel.
would fail, as a result an exception will be thrown, could you check?