Уменьшение выборки (downsampling) предполагает сокращение количества объектов многочисленного класса перед обучением модели.
Задача 1
Чтобы выполнить downsampling, напишите функцию downsample() с тремя аргументами:
- features — признаки;
- target — целевой признак;
- fraction — доля отрицательных объектов, которые нужно сохранить.
Функция вернёт признаки и целевой признак после операции downsampling. Вызовите функцию для обучающих данных с аргументом fraction, равным 0.1. Код выведет на экран размеры выборок.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# Load the preprocessed insurance data and hold out 25% for validation.
data = pd.read_csv('/datasets/travel_insurance_preprocessed.csv')
features = data.drop(columns='Claim')
target = data['Claim']
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345)
def downsample(features, target, fraction):
    """Downsample the majority (negative) class of a binary target.

    Keeps only a `fraction` of the rows whose target is 0, keeps every
    row whose target is 1, and shuffles the result so the minority class
    is not clustered at the end.

    Args:
        features: feature matrix (pandas DataFrame).
        target: binary target column (pandas Series with values 0/1).
        fraction: share of negative-class rows to keep, 0 < fraction <= 1.

    Returns:
        Tuple (features_downsampled, target_downsampled) with matching
        row order.
    """
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    # Sample the majority class once with a fixed seed for reproducibility;
    # pass both pieces to concat directly instead of `[a] + [b]` list addition.
    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345),
         features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345),
         target_ones])
    # Shuffle features and target together so rows stay aligned.
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    return features_downsampled, target_downsampled
# Downsample the training set, keeping 10% of the negative class,
# and report the resulting sample sizes.
features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.1)
for part in (features_downsampled, target_downsampled):
    print(part.shape)
Задача 2
Обучите на новых данных модель LogisticRegression. Найдите для неё значение F1-меры, и код выведет его на экран.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
# Read the preprocessed dataset, separate the target, and make a
# 75/25 train/validation split with a fixed seed.
data = pd.read_csv('/datasets/travel_insurance_preprocessed.csv')
target = data['Claim']
features = data.drop(columns='Claim')
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345)
def downsample(features, target, fraction):
    """Downsample the majority (negative) class of a binary target.

    Keeps only a `fraction` of the rows whose target is 0, keeps every
    row whose target is 1, and shuffles the result so the minority class
    is not clustered at the end.

    Args:
        features: feature matrix (pandas DataFrame).
        target: binary target column (pandas Series with values 0/1).
        fraction: share of negative-class rows to keep, 0 < fraction <= 1.

    Returns:
        Tuple (features_downsampled, target_downsampled) with matching
        row order.
    """
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    # Sample the majority class once with a fixed seed for reproducibility;
    # pass both pieces to concat directly instead of `[a] + [b]` list addition.
    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345),
         features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345),
         target_ones])
    # Shuffle features and target together so rows stay aligned.
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    return features_downsampled, target_downsampled
# Fit logistic regression on the downsampled training data and
# report F1 on the untouched validation set.
features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.1)
model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_downsampled, target_downsampled)
print("F1:", f1_score(target_valid, model.predict(features_valid)))