fraud-detection-ml/demo.py
2025-02-14 11:18:13 +08:00

90 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
# Build a small, simulated transaction dataset for the demo.
data = {
    'transaction_amount': [100, 200, 150, 50, 300, 400, 120, 80],
    'transaction_time': [
        '2025-02-05 12:00:00', '2025-02-05 13:30:00', '2025-02-05 15:00:00',
        '2025-02-05 16:30:00', '2025-02-06 12:00:00', '2025-02-06 13:00:00',
        '2025-02-06 14:30:00', '2025-02-06 16:00:00',
    ],
    'user_id': ['user1', 'user2', 'user3', 'user1', 'user2', 'user3', 'user1', 'user2'],
    'device_info': ['device123', 'device124', 'device123', 'device125', 'device126', 'device124', 'device123', 'device125'],
    'ip_address': ['IP123', 'IP124', 'IP125', 'IP126', 'IP127', 'IP124', 'IP123', 'IP126'],
    'is_fraud': [0, 1, 0, 0, 1, 0, 0, 1],  # 1 = fraud, 0 = legitimate
}
df = pd.DataFrame(data)

# Feature matrix X (raw feature columns) and label vector y (fraud flag).
X = df[['transaction_amount', 'user_id', 'device_info', 'ip_address']]
y = df['is_fraud']

# One-hot encode the categorical features; this returns a new DataFrame, so
# adding a column below does not write into a slice of `df`.
X = pd.get_dummies(X, columns=['user_id', 'device_info', 'ip_address'])

# Convert transaction_time to a Unix timestamp in (float) seconds.
# Subtracting the epoch and dividing by a one-second Timedelta is independent
# of the datetime storage resolution, unlike reinterpreting the values as
# int64 nanoseconds (Series.view is deprecated and removed in newer pandas).
X['transaction_time'] = (
    pd.to_datetime(df['transaction_time']) - pd.Timestamp("1970-01-01")
) / pd.Timedelta(seconds=1)
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training rows (fit() returns the
# estimator itself, so construction and training can be chained).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics followed by the
# confusion matrix, one item per line.
y_pred_rf = rf_model.predict(X_test)
print(
    "Random Forest Classification Report:",
    classification_report(y_test, y_pred_rf),
    confusion_matrix(y_test, y_pred_rf),
    sep="\n",
)
# Train a gradient-boosted model with XGBoost's native (non-sklearn) API,
# which consumes DMatrix objects rather than DataFrames directly.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster parameters. The number of boosting rounds is NOT a booster
# parameter in the native API: it is passed to xgb.train() as
# num_boost_round. ('n_estimators' belongs to the sklearn wrapper and is
# reported as unused if placed here.)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
}

# Train for 100 boosting rounds.
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# With the binary:logistic objective, predict() yields probabilities;
# threshold them at 0.5 to obtain hard 0/1 labels.
y_pred_xgb = xgb_model.predict(dtest)
y_pred_binary_xgb = (y_pred_xgb > 0.5).astype(int).tolist()

# Report per-class metrics and the confusion matrix on the held-out rows.
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_binary_xgb))
print(confusion_matrix(y_test, y_pred_binary_xgb))
# Score one new transaction with both trained models.
# transaction_time is supplied directly as a Unix timestamp in seconds.
# NOTE(review): 1644052800 is earlier than every training timestamp
# (the training rows are from 2025-02) -- confirm this is the intended value.
new_transaction = pd.DataFrame(
    [[100, 'user1', 'device123', 'IP123', 1644052800]],
    columns=['transaction_amount', 'user_id', 'device_info', 'ip_address', 'transaction_time'],
)
new_transaction = pd.get_dummies(
    new_transaction, columns=['user_id', 'device_info', 'ip_address']
)

# One-hot encoding a single row only creates the dummy columns for the
# categories present in that row, in a different order from training.
# Align to the training feature matrix: same columns, same order, missing
# dummies filled with 0 (categories never seen in training are dropped),
# and the same dtypes the models were fitted on.
new_transaction = new_transaction.reindex(columns=X.columns, fill_value=0)
new_transaction = new_transaction.astype(X.dtypes.to_dict())

# Both predict() calls return a length-1 array for the single row.
predicted_label_rf = rf_model.predict(new_transaction)
predicted_label_xgb = xgb_model.predict(xgb.DMatrix(new_transaction))

# Report the verdict of each model; XGBoost returns a probability, so it is
# thresholded at 0.5.
if predicted_label_rf[0] == 1:
    print("随机森林模型预测:交易存在欺诈风险!")
else:
    print("随机森林模型预测:交易正常。")
if predicted_label_xgb[0] > 0.5:
    print("XGBoost模型预测交易存在欺诈风险")
else:
    print("XGBoost模型预测交易正常。")