cspj-application/server-ml/training.py
2024-12-02 20:45:50 +08:00

36 lines
987 B
Python

import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# Train a small bag-of-words text classifier and persist it to model.pkl.
#
# Hand-written toy samples as (text, label) pairs. Label 1 appears to mark
# SQL / injection-style input and 0 ordinary text -- confirm against whatever
# loads model.pkl. Eight samples are demo data only, not a usable training set.
data = [
    ("' OR '1'='1", 1),
    ("SELECT * FROM users WHERE id=1", 1),
    ("DROP TABLE users;", 1),
    ("username=admin'--", 1),
    ("hello world", 0),
    ("this is a normal query", 0),
    ("select data from table", 0),
    ("just another harmless input", 0),
]
queries, labels = zip(*data)

# Hold out 20% of the samples for evaluation. Stratifying on the labels makes
# the held-out part contain both classes, which a plain random split of so few
# samples does not guarantee. random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    queries, labels, test_size=0.2, random_state=42, stratify=labels
)

# Token counts (CountVectorizer) feeding a logistic-regression classifier.
pipeline = make_pipeline(CountVectorizer(), LogisticRegression())

# Fit the vectorizer vocabulary and the classifier on the training part only.
pipeline.fit(X_train, y_train)

# Report mean accuracy on the held-out samples so the test split is actually
# used and a badly fitted model is visible before it is saved.
accuracy = pipeline.score(X_test, y_test)
print(f"Held-out accuracy: {accuracy:.2f} ({len(X_test)} samples)")

# Persist the whole fitted pipeline (vectorizer + classifier) as one artifact,
# written relative to the current working directory.
joblib.dump(pipeline, "model.pkl")
print("Model trained and saved to model.pkl")