90 lines
3.3 KiB
Python
90 lines
3.3 KiB
Python
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.metrics import classification_report, confusion_matrix
|
||
import xgboost as xgb
|
||
|
||
# Simulate a small transaction dataset. Each list holds one value per
# transaction, so every column has the same length (8 rows).
data = {
    'transaction_amount': [100, 200, 150, 50, 300, 400, 120, 80],
    'transaction_time': [
        '2025-02-05 12:00:00', '2025-02-05 13:30:00', '2025-02-05 15:00:00',
        '2025-02-05 16:30:00', '2025-02-06 12:00:00', '2025-02-06 13:00:00',
        '2025-02-06 14:30:00', '2025-02-06 16:00:00',
    ],
    'user_id': ['user1', 'user2', 'user3', 'user1', 'user2', 'user3', 'user1', 'user2'],
    'device_info': [
        'device123', 'device124', 'device123', 'device125',
        'device126', 'device124', 'device123', 'device125',
    ],
    'ip_address': ['IP123', 'IP124', 'IP125', 'IP126', 'IP127', 'IP124', 'IP123', 'IP126'],
    'is_fraud': [0, 1, 0, 0, 1, 0, 0, 1],  # 1 = fraudulent, 0 = normal
}

# Build the DataFrame that the rest of the script works on.
df = pd.DataFrame(data)
# Split the frame into the feature matrix X and the label vector y.
X = df[['transaction_amount', 'user_id', 'device_info', 'ip_address']]
y = df['is_fraud']  # target: 1 = fraudulent, 0 = normal

# One-hot encode the categorical features (user_id, device_info, ip_address).
X = pd.get_dummies(X, columns=['user_id', 'device_info', 'ip_address'])

# Convert transaction_time to a Unix timestamp in (float) seconds.
# Subtracting the epoch and dividing by a one-second Timedelta does not
# depend on the datetime resolution pandas picks when parsing, unlike
# reinterpreting the values as int64 nanoseconds with the deprecated
# Series.view().
X['transaction_time'] = (
    pd.to_datetime(df['transaction_time']) - pd.Timestamp('1970-01-01')
) / pd.Timedelta(seconds=1)
# Split the data into training and test sets (80% / 20%); the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the random forest model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model. The test split is tiny, so a class may be absent from
# it: pin labels to both classes so the report and the 2x2 matrix keep a
# stable layout, and use zero_division=0 so undefined precision/recall is
# reported as 0 instead of raising a warning.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, labels=[0, 1], zero_division=0))
print(confusion_matrix(y_test, y_pred_rf, labels=[0, 1]))
# Train with XGBoost's native API, which takes its data as DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster parameters. The number of boosting rounds is NOT a booster
# parameter in the native API ('n_estimators' belongs to the scikit-learn
# wrapper and would only be reported as unused here); it is passed to
# xgb.train() through num_boost_round instead.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
}

# Train the model for 100 boosting rounds.
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# binary:logistic yields probabilities; threshold at 0.5 to get class labels.
y_pred_xgb = xgb_model.predict(dtest)
y_pred_binary_xgb = [1 if p > 0.5 else 0 for p in y_pred_xgb]

# Evaluate the model. Labels are pinned to both classes and zero_division=0
# is set because the tiny test split may not contain every class.
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_binary_xgb, labels=[0, 1], zero_division=0))
print(confusion_matrix(y_test, y_pred_binary_xgb, labels=[0, 1]))
# A new transaction to score; transaction_time is given as a Unix timestamp.
# NOTE(review): 1644052800 falls in Feb 2022, earlier than every training
# row (Feb 2025) - confirm this is the intended value.
new_transaction = pd.DataFrame(
    [[100, 'user1', 'device123', 'IP123', 1644052800]],
    columns=['transaction_amount', 'user_id', 'device_info', 'ip_address', 'transaction_time'],
)
new_transaction = pd.get_dummies(new_transaction)

# One-hot encoding a single row only produces the dummy columns for the
# values present in that row, in a different order from the training
# matrix. Both models require exactly the training columns, so align the
# row to X's columns (absent categories become 0) and to X's dtypes.
new_transaction = new_transaction.reindex(columns=X.columns, fill_value=0)
new_transaction = new_transaction.astype(X.dtypes.to_dict())

# Predict whether the new transaction is fraudulent.
predicted_label_rf = rf_model.predict(new_transaction)
predicted_label_xgb = xgb_model.predict(xgb.DMatrix(new_transaction))

# Report the results. Each prediction is a one-element array, so compare
# its single element rather than the array itself.
if predicted_label_rf[0] == 1:
    print("随机森林模型预测:交易存在欺诈风险!")
else:
    print("随机森林模型预测:交易正常。")

if predicted_label_xgb[0] > 0.5:
    print("XGBoost模型预测:交易存在欺诈风险!")
else:
    print("XGBoost模型预测:交易正常。")