import numpy as np
import pandas as pd
# Load the labelled training table and the test table in one pass; the
# trailing bare expression renders the training frame in the notebook.
df, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
df
| | ID | sex | length | diameter | height | whole_weight | shucked_weight | viscera_weight | shell_weight | rings |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1721 | M | 0.655 | 0.550 | 0.180 | 1.2740 | 0.5860 | 0.2810 | 0.3650 | 10 |
1 | 3757 | I | 0.520 | 0.410 | 0.140 | 0.6990 | 0.3395 | 0.1290 | 0.1945 | 10 |
2 | 3723 | I | 0.470 | 0.355 | 0.120 | 0.4915 | 0.1765 | 0.1125 | 0.1325 | 9 |
3 | 2005 | I | 0.395 | 0.290 | 0.095 | 0.3000 | 0.1580 | 0.0680 | 0.0780 | 7 |
4 | 1279 | I | 0.495 | 0.380 | 0.130 | 0.5125 | 0.2185 | 0.1160 | 0.1600 | 7 |
5 | 1230 | I | 0.365 | 0.270 | 0.085 | 0.1960 | 0.0825 | 0.0375 | 0.0600 | 7 |
6 | 4037 | I | 0.540 | 0.415 | 0.155 | 0.7020 | 0.3220 | 0.1670 | 0.1900 | 10 |
7 | 2296 | F | 0.535 | 0.450 | 0.135 | 0.8075 | 0.3220 | 0.1810 | 0.2500 | 13 |
8 | 1811 | M | 0.650 | 0.525 | 0.190 | 1.6125 | 0.7770 | 0.3685 | 0.3965 | 11 |
9 | 3580 | F | 0.620 | 0.480 | 0.165 | 1.0430 | 0.4835 | 0.2210 | 0.3100 | 10 |
10 | 4098 | F | 0.650 | 0.495 | 0.160 | 1.3105 | 0.5770 | 0.3315 | 0.3550 | 9 |
11 | 2466 | M | 0.425 | 0.325 | 0.120 | 0.3755 | 0.1420 | 0.1065 | 0.1050 | 9 |
12 | 2667 | F | 0.585 | 0.450 | 0.150 | 0.9380 | 0.4670 | 0.2030 | 0.2250 | 7 |
13 | 3437 | I | 0.395 | 0.300 | 0.090 | 0.2790 | 0.1340 | 0.0490 | 0.0750 | 8 |
14 | 1354 | I | 0.600 | 0.475 | 0.150 | 1.1200 | 0.5650 | 0.2465 | 0.2700 | 10 |
15 | 2122 | F | 0.435 | 0.350 | 0.120 | 0.4585 | 0.1920 | 0.1000 | 0.1300 | 11 |
16 | 249 | I | 0.345 | 0.270 | 0.110 | 0.2135 | 0.0820 | 0.0545 | 0.0700 | 7 |
17 | 1088 | I | 0.450 | 0.340 | 0.120 | 0.4925 | 0.2410 | 0.1075 | 0.1200 | 6 |
18 | 2937 | M | 0.625 | 0.515 | 0.165 | 1.2170 | 0.6670 | 0.2065 | 0.3115 | 10 |
19 | 3516 | F | 0.700 | 0.575 | 0.200 | 1.7365 | 0.7755 | 0.3965 | 0.4610 | 11 |
20 | 3890 | M | 0.515 | 0.400 | 0.140 | 0.7365 | 0.2955 | 0.1840 | 0.1850 | 16 |
21 | 129 | M | 0.710 | 0.540 | 0.165 | 1.9590 | 0.7665 | 0.2610 | 0.7800 | 18 |
22 | 2729 | I | 0.405 | 0.305 | 0.100 | 0.2680 | 0.1145 | 0.0530 | 0.0850 | 7 |
23 | 3690 | M | 0.640 | 0.500 | 0.175 | 1.2730 | 0.5065 | 0.2925 | 0.4050 | 13 |
24 | 1638 | I | 0.575 | 0.445 | 0.170 | 0.8015 | 0.3475 | 0.1465 | 0.2500 | 9 |
25 | 2337 | M | 0.560 | 0.455 | 0.165 | 0.8600 | 0.4015 | 0.1695 | 0.2450 | 11 |
26 | 3139 | I | 0.335 | 0.260 | 0.090 | 0.1835 | 0.0780 | 0.0240 | 0.0650 | 11 |
27 | 1303 | F | 0.535 | 0.410 | 0.130 | 0.7145 | 0.3350 | 0.1440 | 0.2075 | 9 |
28 | 587 | F | 0.550 | 0.410 | 0.145 | 0.8285 | 0.3095 | 0.1905 | 0.2500 | 13 |
29 | 3772 | M | 0.575 | 0.465 | 0.120 | 1.0535 | 0.5160 | 0.2185 | 0.2350 | 9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3311 | 2229 | M | 0.370 | 0.280 | 0.095 | 0.2225 | 0.0805 | 0.0510 | 0.0750 | 7 |
3312 | 2243 | M | 0.465 | 0.360 | 0.130 | 0.5265 | 0.2105 | 0.1185 | 0.1650 | 10 |
3313 | 4100 | F | 0.675 | 0.520 | 0.175 | 1.4940 | 0.7365 | 0.3055 | 0.3700 | 9 |
3314 | 431 | M | 0.600 | 0.470 | 0.155 | 1.0360 | 0.4375 | 0.1960 | 0.3250 | 20 |
3315 | 1143 | M | 0.575 | 0.445 | 0.145 | 0.8470 | 0.4150 | 0.1945 | 0.2200 | 9 |
3316 | 3504 | F | 0.620 | 0.510 | 0.180 | 1.2330 | 0.5920 | 0.2740 | 0.3220 | 10 |
3317 | 2925 | I | 0.605 | 0.480 | 0.155 | 0.9995 | 0.4250 | 0.1985 | 0.3000 | 10 |
3318 | 1036 | F | 0.660 | 0.505 | 0.185 | 1.5280 | 0.6900 | 0.3025 | 0.4410 | 11 |
3319 | 3962 | F | 0.720 | 0.575 | 0.195 | 2.1505 | 1.0745 | 0.3820 | 0.5850 | 10 |
3320 | 1047 | F | 0.705 | 0.535 | 0.180 | 1.6850 | 0.6930 | 0.4200 | 0.4045 | 12 |
3321 | 1273 | I | 0.475 | 0.380 | 0.120 | 0.4410 | 0.1785 | 0.0885 | 0.1505 | 8 |
3322 | 3682 | M | 0.620 | 0.500 | 0.180 | 1.3915 | 0.7260 | 0.2795 | 0.3320 | 11 |
3323 | 383 | M | 0.470 | 0.375 | 0.120 | 0.5565 | 0.2260 | 0.1220 | 0.1950 | 12 |
3324 | 3205 | M | 0.335 | 0.265 | 0.095 | 0.1975 | 0.0795 | 0.0375 | 0.0700 | 9 |
3325 | 1917 | M | 0.600 | 0.475 | 0.150 | 0.9900 | 0.3860 | 0.2195 | 0.3105 | 10 |
3326 | 650 | M | 0.255 | 0.180 | 0.065 | 0.0790 | 0.0340 | 0.0140 | 0.0250 | 5 |
3327 | 4074 | I | 0.520 | 0.400 | 0.140 | 0.6220 | 0.2780 | 0.1455 | 0.1690 | 8 |
3328 | 4084 | F | 0.575 | 0.480 | 0.170 | 1.1000 | 0.5060 | 0.2485 | 0.3100 | 10 |
3329 | 340 | M | 0.575 | 0.455 | 0.145 | 1.1650 | 0.5810 | 0.2275 | 0.3000 | 14 |
3330 | 3526 | I | 0.335 | 0.260 | 0.085 | 0.1920 | 0.0970 | 0.0300 | 0.0540 | 6 |
3331 | 748 | M | 0.535 | 0.420 | 0.130 | 0.8055 | 0.3010 | 0.1810 | 0.2800 | 14 |
3332 | 3952 | I | 0.315 | 0.235 | 0.080 | 0.1800 | 0.0800 | 0.0450 | 0.0470 | 5 |
3333 | 1382 | F | 0.625 | 0.515 | 0.160 | 1.2640 | 0.5715 | 0.3260 | 0.3210 | 9 |
3334 | 579 | F | 0.630 | 0.480 | 0.175 | 1.3675 | 0.5015 | 0.3035 | 0.5150 | 17 |
3335 | 3562 | F | 0.570 | 0.420 | 0.160 | 0.8875 | 0.4315 | 0.1915 | 0.2230 | 8 |
3336 | 1311 | I | 0.550 | 0.430 | 0.145 | 0.7895 | 0.3745 | 0.1710 | 0.2230 | 11 |
3337 | 99 | F | 0.475 | 0.375 | 0.125 | 0.5785 | 0.2775 | 0.0850 | 0.1550 | 10 |
3338 | 2535 | F | 0.640 | 0.500 | 0.180 | 1.4995 | 0.5930 | 0.3140 | 0.4310 | 11 |
3339 | 3253 | I | 0.430 | 0.350 | 0.105 | 0.3660 | 0.1705 | 0.0855 | 0.1100 | 6 |
3340 | 1789 | F | 0.545 | 0.385 | 0.150 | 1.1185 | 0.5425 | 0.2445 | 0.2845 | 9 |
3341 rows × 10 columns
# Per-column count of missing values in the training frame.
df.isna().sum()
ID 0 sex 0 length 0 diameter 0 height 0 whole_weight 0 shucked_weight 0 viscera_weight 0 shell_weight 0 rings 0 dtype: int64
# Distinct category labels present in the "sex" column (sorted).
np.unique(df["sex"].to_numpy())
array(['F', 'I', 'M'], dtype=object)
from sklearn.preprocessing import LabelEncoder
# Learn an integer code for each category of "sex". The fitted encoder stays
# in `lb` so the same mapping can be applied to other frames later on.
lb = LabelEncoder()
lb.fit(df.loc[:, "sex"])
LabelEncoder()
# Replace the categorical "sex" labels with their integer codes. The encoded
# array is assigned directly (positionally); wrapping it in a DataFrame would
# make the assignment depend on index alignment and yield NaN for any frame
# whose index is not the default 0..n-1 range.
df["sex"] = lb.transform(df["sex"])

# Feature matrix: every column except the target. "ID" is only a row
# identifier and is removed from the test features before prediction, so it
# must not be a training feature either, otherwise the fitted model and the
# test matrix disagree on the number of columns. errors="ignore" keeps the
# cell re-runnable if "ID" has already been dropped from `df`.
X = df.drop(columns="rings").drop(columns="ID", errors="ignore")

# Target: number of rings.
y = df["rings"]
df.shape
(3341, 9)
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed seed makes the split, and
# therefore every metric computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Ordinary least-squares baseline trained on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# The target values are whole numbers, so the predictions are rounded to the
# nearest integer before scoring. np.rint rounds halves to even, the same
# rule as the built-in round(), but works on the whole array at once.
y_pred = np.rint(regr.predict(X_test)).astype(int)
mean_squared_error(y_test, y_pred)
(2672, 8) (2672,) (669, 8) (669,)
4.720478325859491
# Second linear-regression fit, this time scored on the raw (unrounded)
# predictions. `fit` returns the estimator itself, so `model` and `lm` refer
# to the same fitted object.
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

# R^2 on the held-out split. Only the last expression of a cell is displayed,
# so the score is printed explicitly instead of being silently discarded.
r2 = model.score(X_test, y_test)
print(f"R^2 on held-out split: {r2:.4f}")

y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred)
4.800862261642637
# Feature frame for the unlabelled test rows: everything except the row
# identifier. `drop` returns a new frame, so `df_test` keeps its "ID" column
# for building the submission later.
X_kk = df_test.drop(columns="ID")

# Apply the encoder fitted on the training data so both frames share one
# label -> integer mapping. The array is assigned directly (positionally)
# instead of going through a DataFrame, which would rely on index alignment.
X_kk["sex"] = lb.transform(X_kk["sex"])
X_train.shape
(2672, 8)
# Display the (rows, columns) of the test feature frame, for comparison with
# the training shape shown just before.
X_kk.shape
(835, 8)
# Predict ring counts for the test rows and round to the nearest whole number
# (np.rint rounds halves to even, matching the built-in round()). The result
# is kept as a plain list of ints.
prediction = np.rint(model.predict(X_kk)).astype(int).tolist()

# Build the two-column submission positionally, so the result does not depend
# on the index labels of `df_test` lining up with a freshly created frame.
submit = pd.DataFrame({"ID": df_test["ID"].to_numpy(), "rings": prediction})

# index=False: without it pandas writes the row index as an extra, unnamed
# first column in the CSV.
submit.to_csv("submit.csv", index=False)
import statsmodels.api as sm
# Note the difference in argument order: statsmodels takes (y, X).
# Unlike sklearn's LinearRegression, sm.OLS does not add an intercept on its
# own, so a constant column is added explicitly; otherwise the fit is forced
# through the origin and is not comparable with the sklearn models.
# has_constant="add" guarantees train and test get the same extra column.
X_train_const = sm.add_constant(X_train, has_constant="add")
X_test_const = sm.add_constant(X_test, has_constant="add")

# NOTE: this rebinds `model` (previously the sklearn estimator); from here on
# `model.predict` expects a matrix that includes the constant column.
model = sm.OLS(y_train, X_train_const).fit()
y_pred = model.predict(X_test_const)
mean_squared_error(y_test, y_pred)
4.938434539226715
from sklearn import linear_model
# L1-regularised linear model (alpha=0.1), fitted on the training split and
# scored by mean squared error on the held-out split.
clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)
6.589766517294562
0.29126584701343283