Machine Learning: Feature Selection Based on XGBoost

Published: 2024-12-26 11:46


This article is about feature selection with XGBoost. Most people know XGBoost as a very popular choice at the modeling stage, but it is also well worth using earlier in the pipeline, for feature screening.

import os
import pickle
import random

import pandas as pd
import xgboost as xgb

# Directory for the per-run feature-score files.
os.makedirs('featurescore', exist_ok=True)

# Load the rank-transformed training features and join in the target column.
train = pd.read_csv('../../data/train/train_x_rank.csv')
train_target = pd.read_csv('../../data/train/train_master.csv', encoding='gb18030')[['Idx', 'target']]
train = pd.merge(train, train_target, on='Idx')
train_y = train.target
train_x = train.drop(['Idx', 'target'], axis=1)
dtrain = xgb.DMatrix(train_x, label=train_y)

# Load the test features; keep the Idx column aside for later.
test = pd.read_csv('../../data/test/test_x_rank.csv')
test_Idx = test.Idx
test = test.drop('Idx', axis=1)
dtest = xgb.DMatrix(test)

# Persist the combined rank features for later reuse.
train_test = pd.concat([train, test])
train_test.to_csv('rank_feature.csv', index=None)
print(train_test.shape)

"""

params={

'booster':'gbtree',

'objective': 'rank:pairwise',

'scale_pos_weight': float(len(train_y)-sum(train_y))/float(sum(train_y)),

'eval_metric': 'auc',

'gamma':0.1,

'max_depth':6,

'lambda':500,

'subsample':0.6,

'colsample_bytree':0.3,

'min_child_weight':0.2,

'eta': 0.04,

'seed':1024,

'nthread':8

}

xgb.cv(params,dtrain,num_boost_round=1100,nfold=10,metrics='auc',show_progress=3,seed=1024)#733

"""

def pipeline(iteration, random_seed, gamma, max_depth, lambd, subsample, colsample_bytree, min_child_weight):
    """Train one XGBoost model with the given hyperparameters and dump its feature scores."""
    params = {
        'booster': 'gbtree',
        'objective': 'rank:pairwise',
        'scale_pos_weight': float(len(train_y) - sum(train_y)) / float(sum(train_y)),
        'eval_metric': 'auc',
        'gamma': gamma,
        'max_depth': max_depth,
        'lambda': lambd,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'eta': 0.2,
        'seed': random_seed,
        'nthread': 8,
    }
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, num_boost_round=700, evals=watchlist)

    # Rank features by split count (get_fscore) and write one CSV per run.
    feature_score = model.get_fscore()
    feature_score = sorted(feature_score.items(), key=lambda x: x[1], reverse=True)
    fs = []
    for (key, value) in feature_score:
        fs.append("{0},{1}\n".format(key, value))
    with open('./featurescore/feature_score_{0}.csv'.format(iteration), 'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)

if __name__ == "__main__":

random_seed = range(10000,20000,100)

gamma = [i/1000.0 for i in range(0,300,3)]

max_depth = [5,6,7]

lambd = range(400,600,2)

subsample = [i/1000.0 for i in range(500,700,2)]

colsample_bytree = [i/1000.0 for i in range(550,750,4)]

min_child_weight = [i/1000.0 for i in range(250,550,3)]

random.shuffle(random_seed)

random.shuffle(gamma)

random.shuffle(max_depth)

random.shuffle(lambd)

random.shuffle(subsample)

random.shuffle(colsample_bytree)

random.shuffle(min_child_weight)

with open('params.pkl','w') as f:

pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

for i in range(36):

pipeline(i,random_seed[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])

Because XGBoost's hyperparameter choices matter a great deal, the parameters are shuffled across runs. The feature/score files produced by these differently parameterized XGBoost models can then be averaged per feature, and the high-scoring features kept.

import os

import pandas as pd

# Sum each feature's score across all per-run CSV files.
files = os.listdir('featurescore')
fs = {}
for fname in files:
    t = pd.read_csv('featurescore/' + fname)
    t.index = t.feature
    t = t.drop(['feature'], axis=1)
    d = t.to_dict()['score']
    for key in d:
        if key in fs:  # dict.has_key() was removed in Python 3
            fs[key] += d[key]
        else:
            fs[key] = d[key]

# Write the aggregated scores, highest first.
fs = sorted(fs.items(), key=lambda x: x[1], reverse=True)
t = []
for (key, value) in fs:
    t.append("{0},{1}\n".format(key, value))
with open('rank_feature_score.csv', 'w') as f:
    f.writelines("feature,score\n")
    f.writelines(t)

This yields each feature's total score; dividing by 36 gives its average score. Finally, take the top-n features by average score. That is my understanding, and a sketch of this last step follows.
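As a minimal sketch (NUM_RUNS = 36 matches the loop above; the top-n cutoff of 50 is purely an illustrative choice, not from the original post), the averaging and top-n selection could look like:

import pandas as pd

NUM_RUNS = 36  # number of models trained above
TOP_N = 50     # illustrative cutoff; tune for your task

scores = pd.read_csv('rank_feature_score.csv')
scores['avg_score'] = scores['score'] / NUM_RUNS
top_features = scores.nlargest(TOP_N, 'avg_score')['feature'].tolist()
print(top_features[:10])

Since every feature is divided by the same constant, the top-n by average score is the same as the top-n by total score; the average is just easier to compare across experiments with different numbers of runs.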

That said, I later felt this approach was too time-consuming.
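If 36 full training runs are too slow, one cheaper single-run variant is possible. The sketch below is my own addition, not part of the original pipeline: it trains one model with fixed parameters (borrowed from the CV block above) and reads two importance types from the same booster.

# A cheaper single-run alternative: one model, several importance views.
import xgboost as xgb

params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'max_depth': 6,
    'eta': 0.04,
    'seed': 1024,
    'nthread': 8,
}
model = xgb.train(params, dtrain, num_boost_round=700)  # dtrain from the script above

# 'weight' counts splits (what get_fscore returns); 'gain' weighs each split
# by the loss reduction it brought, which is often more informative.
for imp_type in ('weight', 'gain'):
    scores = model.get_score(importance_type=imp_type)
    top20 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:20]
    print(imp_type, top20[:5])

The trade-off is that a single run's ranking is tied to one seed and one parameter setting, which is exactly the variance the 36-run averaging was meant to smooth out.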

