Time Series - Python

파이썬 자료형

a.f() 형태를 읽는 팁

a.f()는 f(a)로 생각하면 편리함
a.f(2)는 f(a,2)로 생각하면 편리함

리스트 컴프리헨션

# List comprehension: collect the square of each of 1, 2, 3, 4, then show it.
lst = [n ** 2 for n in (1, 2, 3, 4)]
print(lst)

[1, 4, 9, 16]

[A+B for A in 'XY' for B in '123']

['X1', 'X2', 'X3', 'Y1', 'Y2', 'Y3']

# Filtering with `if`
# e.g. build a list that keeps only the squares (of 1..49) that are divisible by 12.
[x**2 for x in range(1,50) if (x**2 % 12 == 0)]

[36, 144, 324, 576, 900, 1296, 1764, 2304]

튜플

# Swap the values of two variables with tuple packing/unpacking
a = 10; b=20
a,b = b,a
a

# Usage in a for loop: a list of records, each with three fields
lst = [['ksko', 201821991, 'M'],
       ['iu',202254321, 'F'],
       ['hodong', 202011223, 'M']]
lst

[['ksko', 201821991, 'M'], ['iu', 202254321, 'F'], ['hodong', 202011223, 'M']]

for i in lst:
    print(i)

['ksko', 201821991, 'M']
['iu', 202254321, 'F']
['hodong', 202011223, 'M']

for name, studentid, sex in lst:
    print(name,sex)

ksko M
iu F
hodong M

# Dummy variable `_`: a placeholder for unpacked fields that are not used
for _, studentid,_ in lst:
    print(studentid)

201821991
202254321
202011223

# The * operator: `*args` collects every remaining field of the record into a list
for name, *args in lst:
    print(name)

ksko
iu
hodong

인덱싱고급 (스트라이딩)

스트라이딩 [start:stop:step]

lst = list('abcdefgh')
lst

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

lst[0:8:2]

['a', 'c', 'e', 'g']

lst[::2]

['a', 'c', 'e', 'g']

# Extract the elements at even indices (0, 2, ...) and at odd indices (1, 3, ...)
print(lst[::2])
print(lst[1::2])

['a', 'c', 'e', 'g']
['b', 'd', 'f', 'h']

# step = -1 walks the list backwards, i.e. reverses it.
lst[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

# How many elements of the list satisfy x_i > 80?
x = [80, 60, 80, 90, 55, 85, 95, 100, 35, 70, 75, 65, 95]
# Keep only the qualifying elements and count them.
len([score for score in x if score > 80])

리스트 컴프리헨션을 이용하여 \[ z = [x_1^2 + y_1^2, \dots, x_8^2 + y_8^2] = [x_i^2 + y_i^2 : i = 1, 2, \dots, 8] \] 와 같은 리스트를 생성하라.

x = [1, 2, 1, 5, 6, 2, 4, 7]
y = [3, 2, 4, 1, 2, 5, 6, 7]

# Pair the elements with zip() instead of indexing through a hard-coded
# range(8), so the expression does not depend on the lists having exactly
# eight elements (zip stops at the shorter list).
z = [xi**2 + yi**2 for xi, yi in zip(x, y)]
z

[10, 8, 17, 26, 40, 29, 52, 98]

아래와 같은 문자열이 있다고 하자.

test_arr = 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAklOUpkDHrfHY17SbrmTIpNLTGK9Tjom/BWDSUGPl+nafzlHDTYW7hdI4yZ5ew18JH4JW9jbhUFrviQzM7xlELEVf4h9lFX5QVkbPppSwg0cda3Pbv7kOdJ/MTyBlWXFCR+HAo3FXRitBqxiX1nKhXpHAZsMciLq8V6RjsNAQwdsdMFvSlVK/7XAt3FaoJoAsncM1Q9x5+3V0Ww68/eIFmb1zuUFljQJKprrX88XypNDvjYNby6vw/Pb0rwert/EnmZ+AW4OZPnTPI89ZPmVMLuayrD2cE86Z/il8b+gw3r3+1nKatmIkjn2so1d01QraTlMqVSsbxNrRFi9wrf+M7Q== schacon@mylaptop.local'

이 문자열에서 대문자의 수를 count하라.

# Count the uppercase characters: every True returned by str.isupper adds 1.
sum(map(str.isupper, test_arr))

# Combining str.replace() with a list comprehension:
# rewrite every date so that '/' separators become '-'.
lst = [
    '2022/09/21', '2022/10/30', '2022/12/25',
    '2023/01/01', '2023/01/31', '2023/03/20',
]
[date.replace('/', '-') for date in lst]

['2022-09-21',
 '2022-10-30',
 '2022-12-25',
 '2023-01-01',
 '2023-01-31',
 '2023-03-20']

Numpy

import numpy as np

np.linspace(0, 1, 12)

array([0.        , 0.09090909, 0.18181818, 0.27272727, 0.36363636,
       0.45454545, 0.54545455, 0.63636364, 0.72727273, 0.81818182,
       0.90909091, 1.        ])

np.arange(1,6)

array([1, 2, 3, 4, 5])

a = np.array([11,22,33,44,55,66])
a.reshape(2,3)

array([[11, 22, 33],
       [44, 55, 66]])

# reshape does not change `a` itself.

a # `a` is still the same as before

array([11, 22, 33, 44, 55, 66])

# reshape with -1

a.reshape(2,-1)

array([[11, 22, 33],
       [44, 55, 66]])

a.reshape(6,-1)

array([[11],
       [22],
       [33],
       [44],
       [55],
       [66]])

a.reshape(-1) # convert to a vector of length 6

array([11, 22, 33, 44, 55, 66])

np.random.randn(10) # draw 10 samples from the standard normal distribution

array([-1.13998443,  0.76993493, -0.27241088, -1.15305554, -1.07096225,
        1.45305076, -0.69331096,  1.23455362,  0.29442183, -0.04995428])

np.random.rand(10) # draw 10 samples uniformly from the interval [0, 1)

array([0.52396217, 0.88733873, 0.60787126, 0.97045172, 0.93069322,
       0.06647319, 0.53918035, 0.71268956, 0.19958319, 0.05187009])

np.random.randn(4).reshape(2,2) # draw 4 standard-normal samples and reshape them to (2,2)

array([[-1.58796509, -1.44762151],
       [-1.13955349, -0.93728862]])

행렬 관련 기능

A = np.arange(4).reshape(2,2)
A

array([[0, 1],
       [2, 3]])

A.T # .T gives the transpose of the matrix

array([[0, 2],
       [1, 3]])

np.linalg.inv(A) # np.linalg.inv computes the inverse matrix

array([[-1.5,  0.5],
       [ 1. ,  0. ]])

A @ np.linalg.inv(A) # @ performs matrix multiplication

array([[1., 0.],
       [0., 1.]])

Numpy: axis의 이해

두번째 차원을 바꾸고 싶다 -> 두번째 축을 바꾸고 싶다 -> axis = 1 (파이썬은 0부터 시작)

연산 전후의 shape을 비교했을 때 값이 바뀌는 위치가 axis

ex)

# Define two arrays of shape (2, 3, 4) so the example is self-contained;
# no such `a` and `b` were defined before this cell.
a = np.arange(24).reshape(2, 3, 4)
b = np.arange(24, 48).reshape(2, 3, 4)
# Concatenating along axis=1 changes only the second entry of the shape: 3 + 3 = 6.
a.shape, b.shape, np.concatenate([a,b],axis=1).shape


((2, 3, 4), (2, 3, 4), (2, 6, 4))

ex)

a=np.array(range(6)).reshape(2,3)
a, a.shape

(array([[0, 1, 2],
        [3, 4, 5]]),
 (2, 3))

a.sum(axis=0), a.sum(axis=0).shape # the first axis is removed

(array([3, 5, 7]), (3,))

a.sum(axis=1), a.sum(axis=1).shape # the second axis is removed

(array([ 3, 12]), (2,))

Pandas

행과 열의 선택

import pandas as pd

# Fix the seed so the simulated table is reproducible. The draws below all
# consume the same global random stream, so their order must not change.
np.random.seed(43052)
att = np.random.choice(np.arange(10,21)*5,20)  # 20 values from {50, 55, ..., 100}
rep = np.random.choice(np.arange(5,21)*5,20)   # 20 values from {25, 30, ..., 100}
mid = np.random.choice(np.arange(0,21)*5,20)   # 20 values from {0, 5, ..., 100}
fin = np.random.choice(np.arange(0,21)*5,20)   # 20 values from {0, 5, ..., 100}
# 20 distinct ids: '2022-12' followed by a number in 300..500 drawn without replacement
student_id = ['2022-12'+str(s) for s in np.random.choice(np.arange(300,501),20,replace=False)]

df2 = pd.DataFrame({'id':student_id,'att':att,'rep':rep,'mid':mid,'fin':fin})
# The selection examples that follow refer to this frame as `df`, which was
# otherwise undefined at this point, so bind that name to the same object.
df = df2
df2.head()

	id	att	rep	mid	fin
0	2022-12380	65	55	50	40
1	2022-12370	95	100	50	80
2	2022-12363	65	90	60	30
3	2022-12488	55	80	75	80
4	2022-12312	80	30	30	100

가장 안전한 코드

df.loc[:,:]

상황: 하나의 col을 뽑으려 할 때 좋은 코드

df.att 또는 df['att']

상황: row 슬라이싱 할 때 좋은 코드(★★★)

df[:5]

위의 상황 이외에는 df.loc[:,:]를 사용하는 것이 유리하다.

상황: column 슬라이싱 할때

df.loc[:, 'att':'mid']

상황: row + column 슬라이싱하는 가장 좋은 코드

# NOTE: .loc slices by label and includes both end points, so 0:5 selects the
# labels 0 through 5 (six rows with a default integer index), unlike df[:5].
df.loc[0:5, 'att':'mid']

상황: 조건에 맞는 col을 뽑기에 가장 좋은 코드

df.loc[:, [len(col_name)>2 for col_name in df.columns]]

상황: 조건에 맞는 row, col을 뽑기에 가장 좋은 코드

df.loc[df.att<60, [len(col_name)>2 for col_name in df.columns]]

여러 열을 뽑을 때에는 열 이름을 리스트로 묶어 주어야 함. ex) df.loc[:, ['B','C']]

lambda

람다표현식(lambda expression) 자체가 하나의 오브젝트임.

lambda x: (x-2)**2 # as soon as this is evaluated, a function object is created in memory

<function __main__.<lambda>(x)>

(lambda x: (x-2)**2)(5) # input 5 -> (5-2)^2 = 9

람다 표현식에 이름을 줄 수 있음

f = lambda x: (x-2)**2

f(2), f(4), f(6), f(-2)

(0, 4, 16, 16)

위의 코드는 아래와 같다.

def f(x):
    """Return the square of (x - 2)."""
    offset = x - 2
    return offset ** 2
f(2), f(4), f(6), f(-2)

(0, 4, 16, 16)

조건부 출력

# Takes x and y; returns x only when x > y and y otherwise, i.e. returns the larger value
f = lambda x,y: x if x>y else y
f(1,20)

map

list(map(함수, input))

x = [1,2,3]
f = lambda x: x+1
# map applies f to each element of x; list() collects the results into a new list
y = list(map(f,x))
x,y

([1, 2, 3], [2, 3, 4])

s.apply(변환함수)

# Load the FIFA23 official data CSV from GitHub (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/guebin/DV2022/master/posts/FIFA23_official_data.csv')
# Convert each Height entry to an int using only its first three characters.
# Assumes every value is a string starting with three digits (e.g. '189cm') — TODO confirm
df.Height.apply(lambda x: int(x[:3]))

0        189
1        179
2        172
3        181
4        172
        ... 
17655    190
17656    195
17657    190
17658    187
17659    186
Name: Height, Length: 17660, dtype: int64