# Listing metadata: 159 lines, 5.4 KiB, Python.
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier  # decision tree

# Lift pandas' column-display limit so printed frames show every column.
pd.set_option('display.max_columns', None)

# Load the dataset; the path is relative to the current working directory.
data = pd.read_csv('creditcard.csv')

# Preview the first 5 rows.  A bare expression only echoes its value in a
# notebook/REPL, so print explicitly to get output when run as a script.
print(data.head(5))

# Dataset dimensions, then the per-column dtype / non-null summary.
print(data.shape)
data.info()  # info() writes its report to stdout itself

# Descriptive statistics for the columns.
print(data.describe())

# Count of missing values in each column.
print(data.isnull().sum())

# Number of rows in each class.
print(data['Class'].value_counts())

# Separate the target column from the feature matrix.
y = data['Class']
X = data.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("原始数据训练-----")

# Baseline experiment: logistic regression on the current train/test split.
print("\nLogistic Regression:")

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

print("\nDecisionTreeClassifier:")

# Decision tree on the current train/test split.
# The tree builder permutes features at each split, so an unseeded tree can
# give different metrics on every run; seed it like the rest of the script.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

from imblearn.under_sampling import RandomUnderSampler

print("下采样以平衡数据")

# Split BEFORE resampling.  Resampling the whole dataset first would balance
# the test set as well, so the reported metrics would not reflect performance
# on the real class distribution.  The split arguments are unchanged, so the
# held-out rows are still drawn with test_size=0.2 and seed 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Balance the classes in the training portion only.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training rows and reuse it for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_res)
y_train = y_res
X_test = scaler.transform(X_test)

# Logistic regression trained and evaluated on the current train/test split.
print("\nLogistic Regression:")

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

print("\nDecisionTreeClassifier:")

# Decision tree trained and evaluated on the current train/test split.
# The tree builder permutes features at each split, so an unseeded tree can
# give different metrics on every run; seed it like the rest of the script.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

from imblearn.over_sampling import SMOTE

print("上采样以平衡数据")

# Split BEFORE over-sampling.  SMOTE synthesizes minority samples by
# interpolating between neighbours; running it on the full dataset lets
# information from rows that end up in the test set shape the training data,
# and fills the test set with synthetic rows.  The split arguments are
# unchanged, so the held-out rows are still drawn with test_size=0.2 and seed 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Over-sample the minority class in the training portion only.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training rows and reuse it for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_res)
y_train = y_res
X_test = scaler.transform(X_test)

# The original comment labelled this step "model tuning", but no
# hyper-parameter search happens here: the tree is trained with default settings.
print("\nDecisionTreeClassifier:")

# The tree builder permutes features at each split, so an unseeded tree can
# give different metrics on every run; seed it like the rest of the script.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

# Logistic regression trained and evaluated on the current train/test split.
print("\nLogistic Regression:")

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the metrics on the held-out rows, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

from sklearn.model_selection import learning_curve

# Learning curve for logistic regression on the original (un-resampled) X, y.
#
# The estimator is built explicitly instead of reusing whatever `model` was
# assigned last, and the scaler is placed inside a pipeline so it is re-fitted
# on the training fold of every CV split.  Passing raw X would train on
# unscaled features, unlike every other experiment in this script.
# NOTE(review): no `scoring` is given, so the classifier's default score
# (accuracy) is plotted; consider a metric such as F1 if the classes are
# heavily imbalanced.
curve_estimator = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42),
)

train_sizes, train_scores, test_scores = learning_curve(
    curve_estimator,
    X,
    y,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
)

# Aggregate across the CV folds (axis 1) for each training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot the mean curves with a shaded +/- 1 standard deviation band.
plt.figure()
plt.title("Learning Curve")
plt.xlabel("Training Examples")
plt.ylabel("Score")
plt.grid()
plt.fill_between(
    train_sizes,
    train_scores_mean - train_scores_std,
    train_scores_mean + train_scores_std,
    alpha=0.1,
    color="r",
)
plt.fill_between(
    train_sizes,
    test_scores_mean - test_scores_std,
    test_scores_mean + test_scores_std,
    alpha=0.1,
    color="g",
)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation Score")

plt.legend(loc="best")
plt.show()

# Restore two previously persisted models from the working directory.
# NOTE(review): nothing in the visible script writes these files (there is no
# joblib.dump call), so they presumably come from an earlier run - confirm,
# otherwise these loads fail with a missing-file error.
# joblib.load unpickles arbitrary objects; only load files from a trusted source.

# Logistic regression model.
logistic_model_path = 'logistic_regression_model.pkl'
logistic_model = joblib.load(logistic_model_path)

# Decision tree model.
decision_tree_model_path = 'decision_tree_model.pkl'
decision_tree_model = joblib.load(decision_tree_model_path)