Untitled.ipynb

YI

Uploaded on: April 21, 2019, 10:22 a.m.
.python

In [1]:
import numpy as np
import pandas as pd
In [4]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
In [3]:
df
Out[3]:
ID sex length diameter height whole_weight shucked_weight viscera_weight shell_weight rings
0 1721 M 0.655 0.550 0.180 1.2740 0.5860 0.2810 0.3650 10
1 3757 I 0.520 0.410 0.140 0.6990 0.3395 0.1290 0.1945 10
2 3723 I 0.470 0.355 0.120 0.4915 0.1765 0.1125 0.1325 9
3 2005 I 0.395 0.290 0.095 0.3000 0.1580 0.0680 0.0780 7
4 1279 I 0.495 0.380 0.130 0.5125 0.2185 0.1160 0.1600 7
5 1230 I 0.365 0.270 0.085 0.1960 0.0825 0.0375 0.0600 7
6 4037 I 0.540 0.415 0.155 0.7020 0.3220 0.1670 0.1900 10
7 2296 F 0.535 0.450 0.135 0.8075 0.3220 0.1810 0.2500 13
8 1811 M 0.650 0.525 0.190 1.6125 0.7770 0.3685 0.3965 11
9 3580 F 0.620 0.480 0.165 1.0430 0.4835 0.2210 0.3100 10
10 4098 F 0.650 0.495 0.160 1.3105 0.5770 0.3315 0.3550 9
11 2466 M 0.425 0.325 0.120 0.3755 0.1420 0.1065 0.1050 9
12 2667 F 0.585 0.450 0.150 0.9380 0.4670 0.2030 0.2250 7
13 3437 I 0.395 0.300 0.090 0.2790 0.1340 0.0490 0.0750 8
14 1354 I 0.600 0.475 0.150 1.1200 0.5650 0.2465 0.2700 10
15 2122 F 0.435 0.350 0.120 0.4585 0.1920 0.1000 0.1300 11
16 249 I 0.345 0.270 0.110 0.2135 0.0820 0.0545 0.0700 7
17 1088 I 0.450 0.340 0.120 0.4925 0.2410 0.1075 0.1200 6
18 2937 M 0.625 0.515 0.165 1.2170 0.6670 0.2065 0.3115 10
19 3516 F 0.700 0.575 0.200 1.7365 0.7755 0.3965 0.4610 11
20 3890 M 0.515 0.400 0.140 0.7365 0.2955 0.1840 0.1850 16
21 129 M 0.710 0.540 0.165 1.9590 0.7665 0.2610 0.7800 18
22 2729 I 0.405 0.305 0.100 0.2680 0.1145 0.0530 0.0850 7
23 3690 M 0.640 0.500 0.175 1.2730 0.5065 0.2925 0.4050 13
24 1638 I 0.575 0.445 0.170 0.8015 0.3475 0.1465 0.2500 9
25 2337 M 0.560 0.455 0.165 0.8600 0.4015 0.1695 0.2450 11
26 3139 I 0.335 0.260 0.090 0.1835 0.0780 0.0240 0.0650 11
27 1303 F 0.535 0.410 0.130 0.7145 0.3350 0.1440 0.2075 9
28 587 F 0.550 0.410 0.145 0.8285 0.3095 0.1905 0.2500 13
29 3772 M 0.575 0.465 0.120 1.0535 0.5160 0.2185 0.2350 9
... ... ... ... ... ... ... ... ... ... ...
3311 2229 M 0.370 0.280 0.095 0.2225 0.0805 0.0510 0.0750 7
3312 2243 M 0.465 0.360 0.130 0.5265 0.2105 0.1185 0.1650 10
3313 4100 F 0.675 0.520 0.175 1.4940 0.7365 0.3055 0.3700 9
3314 431 M 0.600 0.470 0.155 1.0360 0.4375 0.1960 0.3250 20
3315 1143 M 0.575 0.445 0.145 0.8470 0.4150 0.1945 0.2200 9
3316 3504 F 0.620 0.510 0.180 1.2330 0.5920 0.2740 0.3220 10
3317 2925 I 0.605 0.480 0.155 0.9995 0.4250 0.1985 0.3000 10
3318 1036 F 0.660 0.505 0.185 1.5280 0.6900 0.3025 0.4410 11
3319 3962 F 0.720 0.575 0.195 2.1505 1.0745 0.3820 0.5850 10
3320 1047 F 0.705 0.535 0.180 1.6850 0.6930 0.4200 0.4045 12
3321 1273 I 0.475 0.380 0.120 0.4410 0.1785 0.0885 0.1505 8
3322 3682 M 0.620 0.500 0.180 1.3915 0.7260 0.2795 0.3320 11
3323 383 M 0.470 0.375 0.120 0.5565 0.2260 0.1220 0.1950 12
3324 3205 M 0.335 0.265 0.095 0.1975 0.0795 0.0375 0.0700 9
3325 1917 M 0.600 0.475 0.150 0.9900 0.3860 0.2195 0.3105 10
3326 650 M 0.255 0.180 0.065 0.0790 0.0340 0.0140 0.0250 5
3327 4074 I 0.520 0.400 0.140 0.6220 0.2780 0.1455 0.1690 8
3328 4084 F 0.575 0.480 0.170 1.1000 0.5060 0.2485 0.3100 10
3329 340 M 0.575 0.455 0.145 1.1650 0.5810 0.2275 0.3000 14
3330 3526 I 0.335 0.260 0.085 0.1920 0.0970 0.0300 0.0540 6
3331 748 M 0.535 0.420 0.130 0.8055 0.3010 0.1810 0.2800 14
3332 3952 I 0.315 0.235 0.080 0.1800 0.0800 0.0450 0.0470 5
3333 1382 F 0.625 0.515 0.160 1.2640 0.5715 0.3260 0.3210 9
3334 579 F 0.630 0.480 0.175 1.3675 0.5015 0.3035 0.5150 17
3335 3562 F 0.570 0.420 0.160 0.8875 0.4315 0.1915 0.2230 8
3336 1311 I 0.550 0.430 0.145 0.7895 0.3745 0.1710 0.2230 11
3337 99 F 0.475 0.375 0.125 0.5785 0.2775 0.0850 0.1550 10
3338 2535 F 0.640 0.500 0.180 1.4995 0.5930 0.3140 0.4310 11
3339 3253 I 0.430 0.350 0.105 0.3660 0.1705 0.0855 0.1100 6
3340 1789 F 0.545 0.385 0.150 1.1185 0.5425 0.2445 0.2845 9

3341 rows × 10 columns

In [11]:
df.isnull().sum()
Out[11]:
ID                0
sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64
In [14]:
np.unique(df["sex"])
Out[14]:
array(['F', 'I', 'M'], dtype=object)
In [15]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
lb.fit(df["sex"])
Out[15]:
LabelEncoder()
In [19]:
df["sex"] = pd.DataFrame(lb.transform(df["sex"]),columns = ["sex"])
In [29]:
X = df.copy()
del X["rings"]
y = df["rings"]
In [53]:
df.shape
Out[53]:
(3341, 9)
In [98]:
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
y_pred = [round(x) for x in y_pred]

mean_squared_error(y_test, y_pred)
# # Make predictions using the testing set
# y_pred = regr.predict(X_test)

# # The coefficients
# print('Coefficients: \n', regr.coef_)
# # The mean squared error
# print("Mean squared error: %.2f"
#       % mean_squared_error(y_test, y_pred))
# # Explained variance score: 1 is perfect prediction
# print('Variance score: %.2f' % r2_score(y_test, y_pred))

# # # Plot outputs
# # plt.scatter(X_test, y_test,  color='black')
# # plt.plot(X_test, y_pred, color='blue', linewidth=3)

# # plt.xticks(())
# # plt.yticks(())

# # plt.show()
(2672, 8) (2672,)
(669, 8) (669,)
Out[98]:
4.720478325859491
In [77]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train,y_train)
model.score(X_test,y_test)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
Out[77]:
4.800862261642637
In [82]:
X_kk = df_test.copy()
del X_kk["ID"]
In [83]:
X_kk["sex"] = pd.DataFrame(lb.transform(X_kk["sex"]),columns=["sex"])
In [84]:
X_train.shape
Out[84]:
(2672, 8)
In [85]:
X_kk.shape
Out[85]:
(835, 8)
In [99]:
prediction = model.predict(X_kk)
prediction = [round(x) for x in prediction]
In [100]:
submit = pd.concat([df_test["ID"],pd.DataFrame(prediction)],axis = 1)
In [101]:
submit.columns = ["ID","rings"]
In [103]:
submit.to_csv("submit.csv")
In [78]:
import statsmodels.api as sm
# Note the difference in argument order
model = sm.OLS(y_train, X_train).fit()
# predictions = model.predict(X_train) # make the predictions by the model

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
# Print out the statistics
# model.summary()
Out[78]:
4.938434539226715
In [79]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
mean_squared_error(y_test, y_pred)
Out[79]:
6.589766517294562
In [40]:
Out[40]:
0.29126584701343283
In [ ]: