Experiment sync (#680)
* sync

* rollback
babenek authored Feb 17, 2025
1 parent 1655d3b commit ff602e0
Showing 6 changed files with 75 additions and 39 deletions.
40 changes: 31 additions & 9 deletions experiment/main.py
@@ -142,10 +142,22 @@ def main(cred_data_location: str,

print(f"Memory before search / compile: {LogCallback.get_memory_info()}")

hp_dict = {
"value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.46),
"dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
"dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.18),
}
log_callback = LogCallback()
if use_tuner:
tuner = kt.GridSearch(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape),
print(f"Tuner initial dict:{hp_dict}")
tuner_kwargs = {k: v[0] for k, v in hp_dict.items()}
print(f"Tuner kwargs:{tuner_kwargs}")

tuner = kt.BayesianOptimization(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**tuner_kwargs),
objective='val_loss',
directory=str(dir_path / f"{current_time}.tuner"),
project_name='ml_tuning',
@@ -167,11 +179,17 @@ def main(cred_data_location: str,
print("Best Hyperparameters:")
for k, v in tuner.get_best_hyperparameters()[0].values.items():
print(f"{k}: {v}")
keras_model = tuner.get_best_models()[0]
param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in hp_dict}
del tuner
else:
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape,
x_full_features.shape).build()
print(f"Model is trained with params from dict:{hp_dict}")
param_kwargs = {k: v[1] for k, v in hp_dict.items()}

print(f"Model hyper parameters: {param_kwargs}")

# repeat train step to obtain actual history chart
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**param_kwargs).build()

early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
@@ -197,6 +215,9 @@ def main(cred_data_location: str,
callbacks=[early_stopping, model_checkpoint, log_callback],
use_multiprocessing=True)

# if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
# print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}")

print(f"Memory after train: {LogCallback.get_memory_info()}")

with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
@@ -231,9 +252,9 @@ def main(cred_data_location: str,

onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
# convert the model to onnx right now
command = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
f" --output {str(onnx_model_file)} --verbose"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
f" --output {str(onnx_model_file)} --verbose"
subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
with open(onnx_model_file, "rb") as f:
onnx_md5 = hashlib.md5(f.read()).hexdigest()
print(f"ml_model.onnx:{onnx_md5}")
@@ -294,7 +315,7 @@ def main(cred_data_location: str,
parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
args = parser.parse_args()

fixed_seed = 20250117
fixed_seed = 20250124
print(f"Fixed seed:{fixed_seed}")
tf.random.set_seed(fixed_seed)
np.random.seed(fixed_seed)
@@ -306,6 +327,7 @@ def main(cred_data_location: str,
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)

print(args) # dbg
_model_file_name = main(cred_data_location=args.cred_data_location,
jobs=int(args.jobs),
epochs=int(args.epochs),
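For reference, a minimal standalone sketch (my reading of the diff above, not code from this commit) of how the new `hp_dict` layout is consumed: each entry stores a `(min, max, step)` search range for keras-tuner together with a previously found best value that is used when the `--tuner` flag is not passed.

```python
# Hypothetical sketch of the hp_dict convention introduced in experiment/main.py.
hp_dict = {
    # ((min, max, step) for the tuner search, best known fixed value)
    "value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
    "dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
}

use_tuner = False  # in main.py this comes from the --tuner CLI flag

if use_tuner:
    # search ranges are forwarded to MlModel(**tuner_kwargs) and expanded via hp.Float()
    model_kwargs = {k: v[0] for k, v in hp_dict.items()}
else:
    # without the tuner, train directly with the stored best values
    model_kwargs = {k: v[1] for k, v in hp_dict.items()}

print(model_kwargs)  # {'value_lstm_dropout_rate': 0.41, 'dense_a_lstm_dropout_rate': 0.2}
```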
6 changes: 4 additions & 2 deletions experiment/main.sh
@@ -33,6 +33,8 @@ export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH

# check the current version
"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
git log -1
git status

WORK_DIR="${CREDSWEEPER_DIR}/experiment"
cd "${WORK_DIR}"
@@ -43,13 +45,13 @@ mkdir -vp "${RESULT_DIR}"
#TUNER=--tuner
# set env DOC to apply doc dataset
#DOC=--doc
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} --batch_size 4096 | tee "${RESULT_DIR}/${NOW}.train.log"
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd "${CREDSWEEPER_DIR}"
report_file=${RESULT_DIR}/${NOW}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --rules ${CREDSWEEPER_DIR}/experiment/results/train_config.yaml --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}

cd "${CREDDATA_DIR}"
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
4 changes: 2 additions & 2 deletions experiment/requirements.txt
@@ -8,9 +8,9 @@ keras-tuner==1.4.7
numpy==1.26.4
onnx==1.17.0
protobuf==3.20.3
scikit-learn==1.6.0
scikit-learn==1.6.1
tensorflow==2.15.1
tensorrt==10.7.0
tensorrt==10.8.0.43
tf2onnx==1.16.1
wrapt==1.14.1

47 changes: 28 additions & 19 deletions experiment/src/lstm_model.py
@@ -1,3 +1,5 @@
from typing import Any

import keras_tuner as kt
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
@@ -11,42 +13,49 @@
class MlModel(kt.HyperModel):
d_type = "float32"

def __init__(
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
):
def __init__(self, line_shape: tuple, variable_shape: tuple, value_shape: tuple, feature_shape: tuple, **kwargs):
self.line_shape = line_shape
self.variable_shape = variable_shape
self.value_shape = value_shape
self.feature_shape = feature_shape
self.__kwargs = kwargs

def __get_hyperparam(self, param_name: str, hp=None) -> Any:
if param := self.__kwargs.get(param_name):
if isinstance(param, float):
print(f"'{param_name}' constant = {param}")
return param
elif hp and isinstance(param, tuple) and 3 == len(param):
print(f"'{param_name}' tuning = {param}")
return hp.Float(param_name, min_value=param[0], max_value=param[1], step=param[2])
else:
raise ValueError(f"'{param_name}' was not inited well {param} tuner is {bool(hp)}")
else:
raise ValueError(f"'{param_name}' was not defined during init and tuner is used")

def build(self, hp=None) -> Model:
"""Get keras model with string and feature input and single binary out"""
if hp:
lstm_dropout = hp.Float('dropout_lstm', min_value=0.4, max_value=0.5, step=0.01)
dense_dropout = hp.Float('dropout_threshold', min_value=0.3, max_value=0.4, step=0.01)
else:
# found best values
lstm_dropout = 0.45
dense_dropout = 0.35
value_lstm_dropout_rate = self.__get_hyperparam("value_lstm_dropout_rate", hp)
line_lstm_dropout_rate = self.__get_hyperparam("line_lstm_dropout_rate", hp)
variable_lstm_dropout_rate = self.__get_hyperparam("variable_lstm_dropout_rate", hp)
dense_a_dropout_rate = self.__get_hyperparam("dense_a_lstm_dropout_rate", hp)
dense_b_dropout_rate = self.__get_hyperparam("dense_b_lstm_dropout_rate", hp)

line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
line_bidirectional = Bidirectional(layer=line_lstm, name="line_bidirectional")
line_lstm_branch = Dropout(lstm_dropout, name="line_dropout")(line_bidirectional(line_input))
line_lstm_branch = Dropout(line_lstm_dropout_rate, name="line_dropout")(line_bidirectional(line_input))

variable_input = Input(shape=(None, self.variable_shape[2]), name="variable_input", dtype=self.d_type)
variable_lstm = LSTM(units=self.variable_shape[1], dtype=self.d_type)
variable_bidirectional = Bidirectional(layer=variable_lstm, name="variable_bidirectional")
variable_lstm_branch = Dropout(lstm_dropout, name="variable_dropout")(variable_bidirectional(variable_input))
variable_lstm_branch = Dropout(variable_lstm_dropout_rate,
name="variable_dropout")(variable_bidirectional(variable_input))

value_input = Input(shape=(None, self.value_shape[2]), name="value_input", dtype=self.d_type)
value_lstm = LSTM(units=self.value_shape[1], dtype=self.d_type)
value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
value_lstm_branch = Dropout(lstm_dropout, name="value_dropout")(value_bidirectional(value_input))
value_lstm_branch = Dropout(value_lstm_dropout_rate, name="value_dropout")(value_bidirectional(value_input))

feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)

@@ -58,11 +67,11 @@ def build(self, hp=None) -> Model:

# first hidden layer
dense_a = Dense(units=dense_units, activation='relu', name="a_dense", dtype=self.d_type)(joined_features)
dropout_dense_a = Dropout(dense_dropout, name="a_dropout")(dense_a)
dropout_dense_a = Dropout(dense_a_dropout_rate, name="a_dropout")(dense_a)

# second hidden layer
dense_b = Dense(units=dense_units, activation='relu', name="b_dense", dtype=self.d_type)(dropout_dense_a)
dropout_dense_b = Dropout(dense_dropout, name="b_dropout")(dense_b)
dropout_dense_b = Dropout(dense_b_dropout_rate, name="b_dropout")(dense_b)

dense_final = Dense(units=1, activation='sigmoid', name="prediction", dtype=self.d_type)(dropout_dense_b)

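A small usage sketch (an assumption based on the diff above, not part of the commit) of the dispatch that `MlModel.__get_hyperparam` performs: a plain float is treated as a fixed dropout rate, while a `(min, max, step)` tuple is registered with keras-tuner through `hp.Float` when a `HyperParameters` object is supplied.

```python
# Hypothetical standalone re-implementation of the __get_hyperparam dispatch.
import keras_tuner as kt


def resolve_rate(kwargs: dict, name: str, hp=None) -> float:
    param = kwargs.get(name)
    if isinstance(param, float):
        return param  # fixed value: train with a known-good dropout rate
    if hp is not None and isinstance(param, tuple) and len(param) == 3:
        low, high, step = param
        # during a search the tuner samples from this range; outside a search
        # hp.Float() simply returns its default (the minimum here)
        return hp.Float(name, min_value=low, max_value=high, step=step)
    raise ValueError(f"{name!r} is not configured for this mode: {param!r}")


print(resolve_rate({"line_lstm_dropout_rate": 0.41}, "line_lstm_dropout_rate"))
print(resolve_rate({"line_lstm_dropout_rate": (0.1, 0.5, 0.01)}, "line_lstm_dropout_rate",
                   hp=kt.HyperParameters()))
```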
6 changes: 5 additions & 1 deletion experiment/src/model_config_preprocess.py
@@ -57,16 +57,20 @@ def model_config_preprocess(df_all: pd.DataFrame, doc_target: bool) -> Dict[str,
data_rules_set = set(df_all["RuleName"].explode().unique())

if config_rules_set != data_rules_set:
sorted_rules = sorted(list(data_rules_set))
print("Update config rule names with ", sorted_rules)
for x in model_config["features"]:
if "RuleName" == x["type"]:
x["kwargs"]["rule_names"] = sorted(list(data_rules_set))
x["kwargs"]["rule_names"] = sorted_rules
Util.json_dump(model_config, model_config_path)
break
# the process must be restarted with updated config
raise RuntimeError(f"RESTART: differences in extensions:"
f"\nconfig:{config_rules_set.difference(data_rules_set)}"
f"\ndata:{data_rules_set.difference(config_rules_set)}"
f"\nFile {model_config_path} was updated.")
else:
print(config_rules_set, " matches ", data_rules_set)

thresholds = model_config["thresholds"]
assert isinstance(thresholds, dict), thresholds
11 changes: 5 additions & 6 deletions experiment/src/prepare_data.py
@@ -33,12 +33,11 @@ def data_checksum(dir_path: Path) -> str:
def prepare_train_data(cred_data_location: str, jobs: int, doc_target: bool):
print("Start train data preparation...")

if not os.path.exists("train_config.yaml"):
# use pattern or keyword type
rules = Util.yaml_load("../credsweeper/rules/config.yaml")
target = "doc" if doc_target else "code"
new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
Util.yaml_dump(new_rules, "results/train_config.yaml")
# use current rules
rules = Util.yaml_load("../credsweeper/rules/config.yaml")
target = "doc" if doc_target else "code"
new_rules = [x for x in rules if x.get("use_ml") and target in x["target"]]
Util.yaml_dump(new_rules, "results/train_config.yaml")

meta_checksum = data_checksum(Path(cred_data_location) / "meta")
print(f"meta checksum {meta_checksum}")
