# cspj-application/server-ml/training.py

import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Toy, hand-labelled dataset of (input text, label) pairs. Judging by the
# samples, label 1 marks SQL/injection-style input and label 0 marks benign
# text.
data = [
    ("' OR '1'='1", 1),
    ("SELECT * FROM users WHERE id=1", 1),
    ("DROP TABLE users;", 1),
    ("username=admin'--", 1),
    ("hello world", 0),
    ("this is a normal query", 0),
    ("select data from table", 0),
    ("just another harmless input", 0),
]

# Transpose the list of pairs into parallel tuples of texts and labels.
queries, labels = zip(*data)

# Hold out 20% of the samples for evaluation. Stratify on the labels so that
# both classes appear in the training split and in the test split no matter
# which seed is used; with a dataset this small an unstratified split can
# otherwise end up one-sided.
X_train, X_test, y_train, y_test = train_test_split(
    queries, labels, test_size=0.2, random_state=42, stratify=labels
)

# Bag-of-words counts (CountVectorizer defaults) feeding a logistic
# regression classifier, bundled so the same preprocessing is applied at
# training time and at prediction time.
pipeline = make_pipeline(CountVectorizer(), LogisticRegression())

# Train the model on the training split only.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split so an obviously broken
# model is noticed before it is persisted. The test split is tiny, so treat
# this number as a smoke check rather than a reliable estimate.
accuracy = pipeline.score(X_test, y_test)
print(f"Held-out accuracy on {len(X_test)} samples: {accuracy:.2f}")

# Persist the whole fitted pipeline (vectorizer + classifier) to disk.
joblib.dump(pipeline, "model.pkl")

print("Model trained and saved to model.pkl")