From 4fb3bea169ab592c62d210e4c8368a43926a8335 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Sat, 8 Feb 2025 10:25:22 +0800
Subject: [PATCH] br: fix br integration test (#53836) (#56081)

close pingcap/tidb#53835
---
 br/pkg/backup/client.go              |  3 ++
 br/pkg/utils/backoff.go              |  5 +++
 br/tests/br_file_corruption/run.sh   | 63 ++++++++++++++++++++++++++++
 br/tests/br_file_corruption/workload | 12 ++++++
 br/tests/br_full_ddl/run.sh          | 18 ++++++++
 br/tests/br_pitr/run.sh              | 51 ++++++++++++++++++++++
 6 files changed, 152 insertions(+)
 create mode 100644 br/tests/br_file_corruption/run.sh
 create mode 100644 br/tests/br_file_corruption/workload

diff --git a/br/pkg/backup/client.go b/br/pkg/backup/client.go
index fe7a72a5c01d6..fa5c216a72101 100644
--- a/br/pkg/backup/client.go
+++ b/br/pkg/backup/client.go
@@ -1001,6 +1001,9 @@ func (bc *Client) findRegionLeader(ctx context.Context, key []byte, isRawKv bool
 	// in order to find the correct region.
 	key = codec.EncodeBytesExt([]byte{}, key, isRawKv)
 	for i := 1; i < 100; i++ {
+		if ctx.Err() != nil {
+			return nil, ctx.Err()
+		}
 		// better backoff.
 		region, err := bc.mgr.GetPDClient().GetRegion(ctx, key)
 		if err != nil || region == nil {
diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go
index f2d6ddfba2f13..0294136f7539f 100644
--- a/br/pkg/utils/backoff.go
+++ b/br/pkg/utils/backoff.go
@@ -185,6 +185,11 @@ func (bo *importerBackoffer) NextBackoff(err error) time.Duration {
 			}
 		}
 	}
+	failpoint.Inject("set-import-attempt-to-one", func(_ failpoint.Value) {
+		if bo.attempt > 1 {
+			bo.attempt = 1
+		}
+	})
 	if bo.delayTime > bo.maxDelayTime {
 		return bo.maxDelayTime
 	}
diff --git a/br/tests/br_file_corruption/run.sh b/br/tests/br_file_corruption/run.sh
new file mode 100644
index 0000000000000..5c9b9510af27e
--- /dev/null
+++ b/br/tests/br_file_corruption/run.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+#
+# Copyright 2024 PingCAP, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eux
+
+DB="$TEST_NAME"
+TABLE="usertable"
+CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+run_sql "CREATE DATABASE $DB;"
+go-ycsb load mysql -P $CUR/workload -p mysql.host=$TIDB_IP -p mysql.port=$TIDB_PORT -p mysql.user=root -p mysql.db=$DB
+run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB"
+
+# Replace the single file manipulation with a loop over all .sst files
+for filename in $(find $TEST_DIR/$DB -name "*.sst"); do
+    filename_temp="${filename}_temp"
+    filename_bak="${filename}_bak"
+    echo "corruption" > "$filename_temp"
+    cat "$filename" >> "$filename_temp"
+    mv "$filename" "$filename_bak"
+done
+
+# need to drop db otherwise restore will fail because of cluster not fresh but not the expected issue
+run_sql "DROP DATABASE IF EXISTS $DB;"
+
+# file lost
+export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
+restore_fail=0
+run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" || restore_fail=1
+export GO_FAILPOINTS=""
+if [ $restore_fail -ne 1 ]; then
+    echo 'expect restore to fail on file lost but succeed'
+    exit 1
+fi
+run_sql "DROP DATABASE IF EXISTS $DB;"
+
+# file corruption
+for filename in $(find $TEST_DIR/$DB -name "*.sst_temp"); do
+    mv "$filename" "${filename%_temp}"
+    truncate -s -11 "${filename%_temp}"
+done
+
+export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
+restore_fail=0
+run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" || restore_fail=1
+export GO_FAILPOINTS=""
+if [ $restore_fail -ne 1 ]; then
+    echo 'expect restore to fail on file corruption but succeed'
+    exit 1
+fi
diff --git a/br/tests/br_file_corruption/workload b/br/tests/br_file_corruption/workload
new file mode 100644
index 0000000000000..e3fadf9a3d068
--- /dev/null
+++ b/br/tests/br_file_corruption/workload
@@ -0,0 +1,12 @@
+recordcount=10000
+operationcount=0
+workload=core
+
+readallfields=true
+
+readproportion=0
+updateproportion=0
+scanproportion=0
+insertproportion=0
+
+requestdistribution=uniform
diff --git a/br/tests/br_full_ddl/run.sh b/br/tests/br_full_ddl/run.sh
index e0871e91dd589..9f3ab963193b4 100755
--- a/br/tests/br_full_ddl/run.sh
+++ b/br/tests/br_full_ddl/run.sh
@@ -22,6 +22,7 @@ LOG=/$TEST_DIR/backup.log
 RESTORE_LOG=LOG=/$TEST_DIR/restore.log
 BACKUP_STAT=/$TEST_DIR/backup_stat
 RESOTRE_STAT=/$TEST_DIR/restore_stat
+res_file="$TEST_DIR/sql_res.$TEST_NAME.txt"
 
 run_sql "CREATE DATABASE $DB;"
 go-ycsb load mysql -P tests/$TEST_NAME/workload -p mysql.host=$TIDB_IP -p mysql.port=$TIDB_PORT -p mysql.user=root -p mysql.db=$DB
@@ -38,6 +39,23 @@ for i in $(seq $DDL_COUNT); do
     fi
 done
 
+# wait until the index creation/drop is done
+retry_cnt=0
+while true; do
+    run_sql "ADMIN SHOW DDL JOBS WHERE DB_NAME = '$DB' AND TABLE_NAME = '$TABLE' AND STATE != 'synced';"
+    if grep -Fq "1. row" $res_file; then
+        cat $res_file
+        retry_cnt=$((retry_cnt+1))
+        if [ "$retry_cnt" -gt 50 ]; then
+            echo 'the wait lag is too large'
+            exit 1
+        fi
+        continue
+    fi
+
+    break
+done
+
 # run analyze to generate stats
 run_sql "analyze table $DB.$TABLE;"
 # record field0's stats and remove last_update_version
diff --git a/br/tests/br_pitr/run.sh b/br/tests/br_pitr/run.sh
index 96faac6ef88ae..9421eef972802 100644
--- a/br/tests/br_pitr/run.sh
+++ b/br/tests/br_pitr/run.sh
@@ -109,3 +109,54 @@ run_sql "select * from mysql.gc_delete_range_done"
 run_sql "select count(*) DELETE_RANGE_CNT from (select * from mysql.gc_delete_range union all select * from mysql.gc_delete_range_done) del_range group by ts order by DELETE_RANGE_CNT desc limit 1;"
 expect_delete_range=$(($incremental_delete_range_count-$prepare_delete_range_count))
 check_contains "DELETE_RANGE_CNT: $expect_delete_range"
+
+# start a new cluster for corruption
+echo "restart a services"
+restart_services
+
+file_corruption() {
+    echo "corrupt the whole log files"
+    for filename in $(find $TEST_DIR/$PREFIX/log -regex ".*\.log" | grep -v "schema-meta"); do
+        echo "corrupt the log file $filename"
+        filename_temp=$filename"_temp"
+        echo "corruption" > $filename_temp
+        cat $filename >> $filename_temp
+        mv $filename_temp $filename
+        truncate -s -11 $filename
+    done
+}
+
+# file corruption
+file_corruption
+export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
+restore_fail=0
+run_br --pd $PD_ADDR restore point -s "local://$TEST_DIR/$PREFIX/log" --full-backup-storage "local://$TEST_DIR/$PREFIX/full" || restore_fail=1
+export GO_FAILPOINTS=""
+if [ $restore_fail -ne 1 ]; then
+    echo 'pitr success on file corruption'
+    exit 1
+fi
+
+# start a new cluster for corruption
+echo "restart a services"
+restart_services
+
+file_lost() {
+    echo "lost the whole log files"
+    for filename in $(find $TEST_DIR/$PREFIX/log -regex ".*\.log" | grep -v "schema-meta"); do
+        echo "lost the log file $filename"
+        filename_temp=$filename"_temp"
+        mv $filename $filename_temp
+    done
+}
+
+# file lost
+file_lost
+export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
+restore_fail=0
+run_br --pd $PD_ADDR restore point -s "local://$TEST_DIR/$PREFIX/log" --full-backup-storage "local://$TEST_DIR/$PREFIX/full" || restore_fail=1
+export GO_FAILPOINTS=""
+if [ $restore_fail -ne 1 ]; then
+    echo 'pitr success on file lost'
+    exit 1
+fi