Experiment sync (#680)
* sync

* rollback
babenek authored Feb 17, 2025
1 parent 1655d3b commit ff602e0
Showing 6 changed files with 75 additions and 39 deletions.
40 changes: 31 additions & 9 deletions experiment/main.py
@@ -142,10 +142,22 @@ def main(cred_data_location: str,

print(f"Memory before search / compile: {LogCallback.get_memory_info()}")

hp_dict = {
"value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.46),
"dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
"dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.18),
}
log_callback = LogCallback()
if use_tuner:
tuner = kt.GridSearch(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape),
print(f"Tuner initial dict:{hp_dict}")
tuner_kwargs = {k: v[0] for k, v in hp_dict.items()}
print(f"Tuner kwargs:{tuner_kwargs}")

tuner = kt.BayesianOptimization(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**tuner_kwargs),
objective='val_loss',
directory=str(dir_path / f"{current_time}.tuner"),
project_name='ml_tuning',
@@ -167,11 +179,17 @@ def main(cred_data_location: str,
print("Best Hyperparameters:")
for k, v in tuner.get_best_hyperparameters()[0].values.items():
print(f"{k}: {v}")
keras_model = tuner.get_best_models()[0]
param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in hp_dict}
del tuner
else:
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape,
x_full_features.shape).build()
print(f"Model is trained with params from dict:{hp_dict}")
param_kwargs = {k: v[1] for k, v in hp_dict.items()}

print(f"Model hyper parameters: {param_kwargs}")

# repeat train step to obtain actual history chart
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**param_kwargs).build()

early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
@@ -197,6 +215,9 @@ def main(cred_data_location: str,
callbacks=[early_stopping, model_checkpoint, log_callback],
use_multiprocessing=True)

# if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
# print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}")

print(f"Memory after train: {LogCallback.get_memory_info()}")

with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
@@ -231,9 +252,9 @@ def main(cred_data_location: str,

onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
# convert the model to onnx right now
command = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
f" --output {str(onnx_model_file)} --verbose"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
f" --output {str(onnx_model_file)} --verbose"
subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
with open(onnx_model_file, "rb") as f:
onnx_md5 = hashlib.md5(f.read()).hexdigest()
print(f"ml_model.onnx:{onnx_md5}")
@@ -294,7 +315,7 @@ def main(cred_data_location: str,
parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
args = parser.parse_args()

fixed_seed = 20250117
fixed_seed = 20250124
print(f"Fixed seed:{fixed_seed}")
tf.random.set_seed(fixed_seed)
np.random.seed(fixed_seed)
@@ -306,6 +327,7 @@ def main(cred_data_location: str,
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)

print(args) # dbg
_model_file_name = main(cred_data_location=args.cred_data_location,
jobs=int(args.jobs),
epochs=int(args.epochs),
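For reference, a minimal standalone sketch (my reading of the diff above, not code from this commit) of how the new `hp_dict` layout is consumed: each entry stores a `(min, max, step)` search range for keras-tuner together with a previously found best value that is used when the `--tuner` flag is not passed.

```python
# Hypothetical sketch of the hp_dict convention introduced in experiment/main.py.
hp_dict = {
    # ((min, max, step) for the tuner search, best known fixed value)
    "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
    "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
}

use_tuner = False  # in main.py this comes from the --tuner CLI flag

if use_tuner:
    # search ranges are forwarded to MlModel(**tuner_kwargs) and expanded via hp.Float()
    model_kwargs = {k: v[0] for k, v in hp_dict.items()}
else:
    # without the tuner, train directly with the stored best values
    model_kwargs = {k: v[1] for k, v in hp_dict.items()}

print(model_kwargs)  # {'value_lstm_dropout_rate': 0.41, 'dense_a_lstm_dropout_rate': 0.2}
```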
6 changes: 4 additions & 2 deletions experiment/main.sh
@@ -33,6 +33,8 @@ export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH

# check the current version
"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
git log -1
git status

WORK_DIR="${CREDSWEEPER_DIR}/experiment"
cd "${WORK_DIR}"
@@ -43,13 +45,13 @@ mkdir -vp "${RESULT_DIR}"
#TUNER=--tuner
# set env DOC to apply doc dataset
#DOC=--doc
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} --batch_size 4096 | tee "${RESULT_DIR}/${NOW}.train.log"
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd "${CREDSWEEPER_DIR}"
report_file=${RESULT_DIR}/${NOW}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}

cd "${CREDDATA_DIR}"
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
4 changes: 2 additions & 2 deletions experiment/requirements.txt
@@ -8,9 +8,9 @@ keras-tuner==1.4.7
numpy==1.26.4
onnx==1.17.0
protobuf==3.20.3
scikit-learn==1.6.0
scikit-learn==1.6.1
tensorflow==2.15.1
tensorrt==10.7.0
tensorrt==10.8.0.43
tf2onnx==1.16.1
wrapt==1.14.1

47 changes: 28 additions & 19 deletions experiment/src/lstm_model.py
@@ -1,3 +1,5 @@
from typing import Any

import keras_tuner as kt
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
@@ -11,42 +13,49 @@
class MlModel(kt.HyperModel):
d_type = "float32"

def __init__(
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
):
def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, **kwargs):
self.line_shape = line_shape
self.variable_shape = variable_shape
self.value_shape = value_shape
self.feature_shape = feature_shape
self.__kwargs = kwargs

def __get_hyperparam(self, param_name: str, hp=None) -> Any:
if param := self.__kwargs.get(param_name):
if isinstance(param, float):
print(f"'{param_name}' constant = {param}")
return param
elif hp and isinstance(param, tuple) and 3 == len(param):
print(f"'{param_name}' tuning = {param}")
return hp.Float(param_name, min_value=param[0], max_value=param[1], step=param[2])
else:
raise ValueError(f"'{param_name}' was not inited well {param} tuner is {bool(hp)}")
else:
raise ValueError(f"'{param_name}' was not defined during init and tuner is used")

def build(self, hp=None) -> Model:
"""Get keras model with string and feature input and single binary out"""
if hp:
lstm_dropout = hp.Float('dropout_lstm', min_value=0.4, max_value=0.5, step=0.01)
dense_dropout = hp.Float('dropout_threshold', min_value=0.3, max_value=0.4, step=0.01)
else:
# found best values
lstm_dropout = 0.45
dense_dropout = 0.35
value_lstm_dropout_rate = self.__get_hyperparam("value_lstm_dropout_rate", hp)
line_lstm_dropout_rate = self.__get_hyperparam("line_lstm_dropout_rate", hp)
variable_lstm_dropout_rate = self.__get_hyperparam("variable_lstm_dropout_rate", hp)
dense_a_dropout_rate = self.__get_hyperparam("dense_a_lstm_dropout_rate", hp)
dense_b_dropout_rate = self.__get_hyperparam("dense_b_lstm_dropout_rate", hp)

line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
line_bidirectional = Bidirectional(layer=line_lstm, name="line_bidirectional")
line_lstm_branch = Dropout(lstm_dropout, name="line_dropout")(line_bidirectional(line_input))
line_lstm_branch = Dropout(line_lstm_dropout_rate, name="line_dropout")(line_bidirectional(line_input))

variable_input = Input(shape=(None, self.variable_shape[2]), name="variable_input", dtype=self.d_type)
variable_lstm = LSTM(units=self.variable_shape[1], dtype=self.d_type)
variable_bidirectional = Bidirectional(layer=variable_lstm, name="variable_bidirectional")
variable_lstm_branch = Dropout(lstm_dropout, name="variable_dropout")(variable_bidirectional(variable_input))
variable_lstm_branch = Dropout(variable_lstm_dropout_rate,
name="variable_dropout")(variable_bidirectional(variable_input))

value_input = Input(shape=(None, self.value_shape[2]), name="value_input", dtype=self.d_type)
value_lstm = LSTM(units=self.value_shape[1], dtype=self.d_type)
value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
value_lstm_branch = Dropout(lstm_dropout, name="value_dropout")(value_bidirectional(value_input))
value_lstm_branch = Dropout(value_lstm_dropout_rate, name="value_dropout")(value_bidirectional(value_input))

feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)

@@ -58,11 +67,11 @@ def build(self, hp=None) -> Model:

# first hidden layer
dense_a = Dense(units=dense_units, activation='relu', name="a_dense", dtype=self.d_type)(joined_features)
dropout_dense_a = Dropout(dense_dropout, name="a_dropout")(dense_a)
dropout_dense_a = Dropout(dense_a_dropout_rate, name="a_dropout")(dense_a)

# second hidden layer
dense_b = Dense(units=dense_units, activation='relu', name="b_dense", dtype=self.d_type)(dropout_dense_a)
dropout_dense_b = Dropout(dense_dropout, name="b_dropout")(dense_b)
dropout_dense_b = Dropout(dense_b_dropout_rate, name="b_dropout")(dense_b)

dense_final = Dense(units=1, activation='sigmoid', name="prediction", dtype=self.d_type)(dropout_dense_b)

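A small usage sketch (an assumption based on the diff above, not part of the commit) of the dispatch that `MlModel.__get_hyperparam` performs: a plain float is treated as a fixed dropout rate, while a `(min, max, step)` tuple is registered with keras-tuner through `hp.Float` when a `HyperParameters` object is supplied.

```python
# Hypothetical standalone re-implementation of the __get_hyperparam dispatch.
import keras_tuner as kt


def resolve_rate(kwargs: dict, name: str, hp=None) -> float:
    param = kwargs.get(name)
    if isinstance(param, float):
        return param  # fixed value: train with a known-good dropout rate
    if hp is not None and isinstance(param, tuple) and len(param) == 3:
        low, high, step = param
        # during a search the tuner samples from this range; outside a search
        # hp.Float() simply returns its default (the minimum here)
        return hp.Float(name, min_value=low, max_value=high, step=step)
    raise ValueError(f"{name!r} is not configured for this mode: {param!r}")


print(resolve_rate({"line_lstm_dropout_rate": 0.41}, "line_lstm_dropout_rate"))
print(resolve_rate({"line_lstm_dropout_rate": (0.1, 0.5, 0.01)}, "line_lstm_dropout_rate",
                   hp=kt.HyperParameters()))
```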
6 changes: 5 additions & 1 deletion experiment/src/model_config_preprocess.py
@@ -57,16 +57,20 @@ def model_config_preprocess(df_all: pd.DataFrame, doc_target: bool) -> Dict[str,
data_rules_set = set(df_all["RuleName"].explode().unique())

if config_rules_set != data_rules_set:
sorted_rules = sorted(list(data_rules_set))
print("Update config rule names with ", sorted_rules)
for x in model_config["features"]:
if "RuleName" == x["type"]:
x["kwargs"]["rule_names"] = sorted(list(data_rules_set))
x["kwargs"]["rule_names"] = sorted_rules
Util.json_dump(model_config, model_config_path)
break
# the process must be restarted with updated config
raise RuntimeError(f"RESTART: differences in extensions:"
f"\nconfig:{config_rules_set.difference(data_rules_set)}"
f"\ndata:{data_rules_set.difference(config_rules_set)}"
f"\nFile {model_config_path} was updated.")
else:
print(config_rules_set, " matches ", data_rules_set)

thresholds = model_config["thresholds"]
assert isinstance(thresholds, dict), thresholds
11 changes: 5 additions & 6 deletions experiment/src/prepare_data.py
@@ -33,12 +33,11 @@ def data_checksum(dir_path: Path) -> str:
def prepare_train_data(cred_data_location: str, jobs: int, doc_target: bool):
print("Start train data preparation...")

if not os.path.exists("train_config.yaml"):
# use pattern or keyword type
rules = Util.yaml_load("../credsweeper/rules/config.yaml")
target = "doc" if doc_target else "code"
new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
Util.yaml_dump(new_rules, "results/train_config.yaml")
# use current rules
rules = Util.yaml_load("../credsweeper/rules/config.yaml")
target = "doc" if doc_target else "code"
new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
Util.yaml_dump(new_rules, "results/train_config.yaml")

meta_checksum = data_checksum(Path(cred_data_location) / "meta")
print(f"meta checksum {meta_checksum}")
