import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

# Toy fraud-detection demo: build a tiny transaction table, train a random
# forest and an XGBoost booster on it, print evaluation reports, then score
# one new transaction with both models.

# Columns that are one-hot encoded before training and before inference.
CATEGORICAL_COLUMNS = ['user_id', 'device_info', 'ip_address']

# Simulate a simple transaction data set.
data = {
    'transaction_amount': [100, 200, 150, 50, 300, 400, 120, 80],
    'transaction_time': [
        '2025-02-05 12:00:00', '2025-02-05 13:30:00',
        '2025-02-05 15:00:00', '2025-02-05 16:30:00',
        '2025-02-06 12:00:00', '2025-02-06 13:00:00',
        '2025-02-06 14:30:00', '2025-02-06 16:00:00',
    ],
    'user_id': ['user1', 'user2', 'user3', 'user1',
                'user2', 'user3', 'user1', 'user2'],
    'device_info': ['device123', 'device124', 'device123', 'device125',
                    'device126', 'device124', 'device123', 'device125'],
    'ip_address': ['IP123', 'IP124', 'IP125', 'IP126',
                   'IP127', 'IP124', 'IP123', 'IP126'],
    'is_fraud': [0, 1, 0, 0, 1, 0, 0, 1],  # 1 = fraud, 0 = legitimate
}

# Build the DataFrame.
df = pd.DataFrame(data)

# Feature matrix X and label vector y.
X = df[['transaction_amount'] + CATEGORICAL_COLUMNS]
y = df['is_fraud']

# One-hot encode the categorical features.
X = pd.get_dummies(X, columns=CATEGORICAL_COLUMNS)

# Convert transaction_time to Unix seconds (float). This is computed with
# timedelta arithmetic rather than Series.view, which recent pandas releases
# deprecate.
transaction_timestamps = pd.to_datetime(df['transaction_time'])
X['transaction_time'] = (
    (transaction_timestamps - pd.Timestamp('1970-01-01'))
    / pd.Timedelta(seconds=1)
)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the random forest model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set.
y_pred_rf = rf_model.predict(X_test)

# Evaluate the random forest.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

# Train with the native XGBoost API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# The number of boosting rounds is passed to xgb.train via num_boost_round,
# so it is not repeated here as 'n_estimators'.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.1,
}

xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# binary:logistic yields probabilities; threshold them at 0.5.
y_pred_xgb = xgb_model.predict(dtest)
y_pred_binary_xgb = [1 if p > 0.5 else 0 for p in y_pred_xgb]

# Evaluate the XGBoost model.
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_binary_xgb))
print(confusion_matrix(y_test, y_pred_binary_xgb))

# A new transaction to score (transaction_time given as a Unix timestamp).
new_transaction = [[100, 'user1', 'device123', 'IP123', 1644052800]]
new_transaction = pd.get_dummies(
    pd.DataFrame(
        new_transaction,
        columns=['transaction_amount', 'user_id', 'device_info',
                 'ip_address', 'transaction_time'],
    ),
    columns=CATEGORICAL_COLUMNS,
)
# Encoding a single row only produces the dummy columns for the categories
# present in that row. Align to the training layout (same columns, same
# order, absent categories filled with 0) and the training dtypes so both
# models accept the input.
new_transaction = (
    new_transaction
    .reindex(columns=X.columns, fill_value=0)
    .astype(X.dtypes.to_dict())
)

# Predict whether the new transaction is fraudulent.
predicted_label_rf = rf_model.predict(new_transaction)
predicted_label_xgb = xgb_model.predict(xgb.DMatrix(new_transaction))

# Report the predictions. Both predict calls return one-element arrays, so
# index the single result explicitly instead of testing the whole array.
if predicted_label_rf[0] == 1:
    print("随机森林模型预测:交易存在欺诈风险!")
else:
    print("随机森林模型预测:交易正常。")

if predicted_label_xgb[0] > 0.5:
    print("XGBoost模型预测:交易存在欺诈风险!")
else:
    print("XGBoost模型预测:交易正常。")