데이터 분석 Dacon
데이콘 대회에 참여하며 알게 된 것들 정리.
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
# Load the training and test sets from CSV files under the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.shape
(1459, 11)
test.shape
(715, 10)
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 1459 non-null int64
1 hour 1459 non-null int64
2 hour_bef_temperature 1457 non-null float64
3 hour_bef_precipitation 1457 non-null float64
4 hour_bef_windspeed 1450 non-null float64
5 hour_bef_humidity 1457 non-null float64
6 hour_bef_visibility 1457 non-null float64
7 hour_bef_ozone 1383 non-null float64
8 hour_bef_pm10 1369 non-null float64
9 hour_bef_pm2.5 1342 non-null float64
10 count 1459 non-null float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 715 non-null int64
1 hour 715 non-null int64
2 hour_bef_temperature 714 non-null float64
3 hour_bef_precipitation 714 non-null float64
4 hour_bef_windspeed 714 non-null float64
5 hour_bef_humidity 714 non-null float64
6 hour_bef_visibility 714 non-null float64
7 hour_bef_ozone 680 non-null float64
8 hour_bef_pm10 678 non-null float64
9 hour_bef_pm2.5 679 non-null float64
dtypes: float64(8), int64(2)
memory usage: 56.0 KB
train['Embarked'].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
train['Embarked'].unique()
array(['S', 'C', 'Q', nan], dtype=object)
# Bar chart of the mean of 'Survived' for each 'Pclass' group.
# The column is selected *before* aggregating so that only it is averaged.
# Calling .mean() on the whole grouped frame also attempts the object-dtype
# columns (e.g. 'Embarked'), which raises TypeError on pandas >= 2.0 and
# wastes work on older versions.
train.groupby('Pclass')['Survived'].mean().plot(kind='bar', rot=0)  # rot=0: x tick labels rotated by 0 degrees
pd.Series.plot(kind = 'hist')
- 히스토그램: 구간별로 속해 있는 row의 개수를 시각화합니다.
- 수치형에서만 가능, 범주는 안됩니다!
train['Age'].plot(kind='hist', bins = 30) # bins: how finely the value range is divided
보조선 => grid = True
train['Age'].plot(kind='hist', bins = 30, grid=True) # bins: how finely the value range is divided; grid=True adds guide lines
pd.DataFrame.plot(x, y, kind = 'scatter')
- 산점도: 두 변수간의 관계를 시각화
# Scatter plot with 'Age' on the x axis and 'Fare' on the y axis.
train.plot(x = 'Age', y = 'Fare', kind = 'scatter')
# Missing values are handled differently per frame: training rows containing
# any missing value are dropped, while test rows are all kept and their
# missing values are replaced with 0.
train = train.dropna()
test = test.fillna(0)
# Print the number of missing values left in each training column.
print(train.isnull().sum())
id 0
hour 0
hour_bef_temperature 0
hour_bef_precipitation 0
hour_bef_windspeed 0
hour_bef_humidity 0
hour_bef_visibility 0
hour_bef_ozone 0
hour_bef_pm10 0
hour_bef_pm2.5 0
count 0
dtype: int64
# Fill missing 'hour_bef_temperature' values with that column's mean.
# int() truncates the mean to a whole number before it is used as the fill
# value, and inplace=True mutates train directly rather than returning a copy.
train.fillna({'hour_bef_temperature' : int(train['hour_bef_temperature'].mean())},inplace=True)
# Separate the regression target ('count') from the remaining feature columns.
Y_train = train['count']
X_train = train.drop(columns=['count'])
# Fit a decision-tree regressor with its default hyper-parameters.
model = DecisionTreeRegressor()
model.fit(X_train, Y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
# Predict the target for every row of the test frame with the fitted model.
pred = model.predict(test)
# Load the submission template, overwrite its 'count' column with the
# predictions, and write the result without the DataFrame index.
submission = pd.read_csv('data/submission.csv')
submission['count'] = pred
submission.to_csv('sub.csv',index=False)
# Overwrite single cells of train in place.
# Label-based: row label 973, column '석식계'. One .loc call replaces the chained
# form train['석식계'][973] = ..., which assigns through an intermediate Series.
# pandas flags that with SettingWithCopyWarning, and with Copy-on-Write enabled
# the original frame is left unchanged.
train.loc[973, '석식계'] = 479.8605851979346
# Position-based: second row, third column.
train.iloc[1, 2] = 498
# Mean of '중식계' over the rows matching the query.
# The column is selected before .mean() so that only it is averaged. Averaging
# the whole frame first would raise TypeError on pandas >= 2.0 if the frame
# holds any non-numeric column (assumption: verify against the dataset schema).
train.query('월<4 & 월>1 & 년 == 2020')['중식계'].mean()
952.4285714285714
# Per-'월' mean of '중식계' for rows where 년 > 2016, sorted ascending.
# The column is selected before aggregating so only it is averaged. Calling
# .mean() on the whole grouped frame would raise TypeError on pandas >= 2.0 if
# any non-numeric column is present (assumption: verify against the schema).
train.query('년>2016').groupby('월')['중식계'].mean().sort_values()
월
11 815.963415
12 834.473684
8 838.253012
7 839.523256
6 840.333333
5 849.460526
4 880.225000
10 894.666667
9 901.842857
1 934.278351
3 952.829268
2 998.042857
Name: 중식계, dtype: float64
- 특정 행 데이터 드랍
# Drop specific rows: train.index[[...]] looks up the index labels found at the
# listed integer positions, and drop() then removes the rows with those labels.
train = train.drop(train.index[[204, 224, 244, 262, 281, 306, 327, 346, 366, 392, 410, 828, 853, 872, 890, 912, 932, 955, 973, 993, 1166]])
- 특정 열 데이터 드랍
df.drop('열 이름', axis=1)