티스토리 뷰

맥주
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Column labels are supplied explicitly; because `names` is passed without
# `header`, read_csv treats the first line of the file as data.
column_names = ['user', 'beer_name', 'brewery', 'beer_style', 'score', 'date']
df = pd.read_csv(
    'finally_beer.csv',
    encoding='utf-8-sig',
    names=column_names,
)
df.head()
Out[2]:
user beer_name brewery beer_style score date
0 3355 King Two Fisted Old Ale King Brewing Company Old Ale 1.2 4/26/2006
1 3355 Flying Dog Snake Dog IPA (through 2007) Flying Dog Brewery India Pale Ale (IPA) 3.3 3/18/2006
2 3355 Bluegrass American Pale Ale Bluegrass Brewing Company American Pale Ale 3.3 2/26/2006
3 3355 Flying Dog Old Scratch Amber Lager Flying Dog Brewery California Common 2.7 2/18/2006
4 3355 Casco Bay Pale Ale Casco Bay Brewing American Pale Ale 3.1 2/5/2006
In [3]:
# Print the frame summary: index range, per-column dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8281246 entries, 0 to 8281245
Data columns (total 6 columns):
user          object
beer_name     object
brewery       object
beer_style    object
score         float64
date          object
dtypes: float64(1), object(5)
memory usage: 379.1+ MB
In [4]:
# Per-column count of missing values (`isna` is the same method as `isnull`).
df.isna().sum()
Out[4]:
user          0
beer_name     0
brewery       0
beer_style    0
score         0
date          0
dtype: int64
In [5]:
# Number of distinct beer names; dropna=False keeps a missing value counted
# as its own entry, matching the length of `.unique()`.
df['beer_name'].nunique(dropna=False)
Out[5]:
507864
In [6]:
# Distinct user ids: show how many there are alongside the values themselves.
unique_users = df['user'].unique()
len(unique_users), unique_users
Out[6]:
(10377, array(['3355', '3368', '3388', ..., '569050', '569699', '570319'],
       dtype=object))
In [7]:
# Distinct beer styles: show how many there are alongside the values themselves.
unique_styles = df['beer_style'].unique()
len(unique_styles), unique_styles
Out[7]:
(94, array(['Old Ale', 'India Pale Ale (IPA)', 'American Pale Ale',
        'California Common', 'Amber Ale', 'English Strong Ale',
        'Amber Lager/Vienna', 'Abbey Tripel', 'Zwickel/Keller/Landbier',
        'Pale Lager', 'Belgian Strong Ale', 'Golden Ale/Blond Ale',
        'Premium Bitter/ESB', 'Premium Lager', 'Pilsener',
        'Dortmunder/Helles', 'Bitter', 'Czech Pilsner (Světlý)', 'Kölsch',
        'German Hefeweizen', 'English Pale Ale', 'Cream Ale', 'Wheat Ale',
        'Porter', 'American Strong Ale ', 'Spice/Herb/Vegetable',
        'Abt/Quadrupel', 'Fruit Beer', 'Dry Stout', 'Abbey Dubbel',
        'Oktoberfest/Märzen', 'Heller Bock', 'Scotch Ale', 'Altbier',
        'Brown Ale', 'Imperial IPA', 'Stout', 'Barley Wine', 'Doppelbock',
        'Irish Ale', 'Weizenbock', 'Foreign Stout', 'Dunkel/Tmavý',
        'Sweet Stout', 'Dunkler Bock', 'Imperial Stout', 'Witbier',
        'Dunkelweizen', 'Malt Liquor', 'Perry', 'Lambic Style - Fruit',
        'Imperial Pils/Strong Pale Lager', 'Imperial Porter',
        'Low Alcohol', 'Lambic Style - Gueuze', 'Eisbock',
        'Bière de Garde', 'Sour Red/Brown', 'Smoked', 'Belgian Ale',
        'German Kristallweizen', 'Scottish Ale', 'Baltic Porter',
        'Specialty Grain', 'Traditional Ale', 'Mild Ale', 'Cider',
        'Session IPA', 'Sour/Wild Ale', 'Mead', 'Saison',
        'India Style Lager', 'Black IPA', 'Grodziskie/Gose/Lichtenhainer',
        'Berliner Weisse', 'Schwarzbier', 'Radler/Shandy',
        'Saké - Tokubetsu', 'Ice Cider/Ice Perry', 'Saké - Infused',
        'Saké - Koshu', 'Saké - Namasaké', 'Sahti/Gotlandsdricke/Koduõlu',
        'Lambic Style - Unblended', 'Saké - Junmai', 'Polotmavý',
        'Saké - Ginjo', 'Lambic Style - Faro', 'Saké - Genshu',
        'Saké - Daiginjo', 'Saké - Nigori', 'Saké - Futsu-shu',
        'Saké - Taru', 'Saké - Honjozo'], dtype=object))
In [8]:
# Keep only the first occurrence of each user id; the surviving index labels
# show the row where each user first appears.
is_repeat_user = df['user'].duplicated()
df.loc[~is_repeat_user, 'user']
Out[8]:
0            3355
84           3368
142          3388
260          3390
322          3313
488          3389
712          3395
901          3319
997          3343
9474         3346
9581         3351
9649            2
9708            3
9955           46
12608          57
12709          63
16225          65
16336          66
16494          69
17980          74
18045          86
18096         123
18265         129
18423         152
18593         154
20918         159
20972         167
21148         180
21490         181
35677         189
            ...  
8277859    553416
8277956    553694
8278009    555203
8278188    556541
8278241    556550
8278337    557173
8278560    557324
8278613    557340
8278694    557348
8278749    557496
8278834    557747
8279210    558840
8279268    559121
8279320    559794
8279416    559886
8279477    560010
8279535    560456
8279695    562572
8279767    562939
8279908    563003
8280045    564455
8280099    565566
8280324    566518
8280382    566526
8280449    568034
8280501    568064
8280940    568998
8280998    569050
8281052    569699
8281183    570319
Name: user, Length: 10377, dtype: object
In [9]:
# Convert the user ids (read in as strings) to integers, then re-check dtypes.
user_ids = df['user'].astype(int)
df['user'] = user_ids
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8281246 entries, 0 to 8281245
Data columns (total 6 columns):
user          int32
beer_name     object
brewery       object
beer_style    object
score         float64
date          object
dtypes: float64(1), int32(1), object(4)
memory usage: 347.5+ MB
In [10]:
# Distinct score values in order of first appearance.
df['score'].drop_duplicates(keep='first')
Out[10]:
0      1.2
1      3.3
3      2.7
4      3.1
5      1.5
6      3.2
8      3.5
9      2.8
10     0.8
11     3.0
18     2.0
21     3.4
26     1.8
29     2.5
30     2.1
34     2.9
35     3.8
42     2.6
47     1.9
48     3.6
49     2.3
58     4.1
60     3.7
64     4.2
65     0.6
71     2.4
77     3.9
80     0.5
81     0.9
86     4.5
87     4.3
88     5.0
90     4.4
106    2.2
112    1.3
138    4.8
145    1.7
154    1.4
163    4.6
169    1.0
215    4.0
268    1.1
280    0.7
295    1.6
477    4.9
897    4.7
Name: score, dtype: float64
In [11]:
# Split the 'month/day/year' strings once with the vectorized .str accessor
# instead of running three separate Python-level apply() passes over the column.
# expand=True returns a frame whose integer-labelled columns 0, 1, 2 hold the
# month, day and year pieces (still as strings at this point).
date_parts = df['date'].str.strip().str.split('/', expand=True)
df['월'] = date_parts[0]  # month
df['일'] = date_parts[1]  # day
df['년'] = date_parts[2]  # year
df.head()
Out[11]:
user beer_name brewery beer_style score date 월 일 년
0 3355 King Two Fisted Old Ale King Brewing Company Old Ale 1.2 4/26/2006 4 26 2006
1 3355 Flying Dog Snake Dog IPA (through 2007) Flying Dog Brewery India Pale Ale (IPA) 3.3 3/18/2006 3 18 2006
2 3355 Bluegrass American Pale Ale Bluegrass Brewing Company American Pale Ale 3.3 2/26/2006 2 26 2006
3 3355 Flying Dog Old Scratch Amber Lager Flying Dog Brewery California Common 2.7 2/18/2006 2 18 2006
4 3355 Casco Bay Pale Ale Casco Bay Brewing American Pale Ale 3.1 2/5/2006 2 5 2006
In [15]:
# Turn the month / day / year string columns into integers, then re-check dtypes.
for part_column in ('월', '일', '년'):
    df[part_column] = df[part_column].astype(int)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8281246 entries, 0 to 8281245
Data columns (total 9 columns):
user          int32
beer_name     object
brewery       object
beer_style    object
score         float64
date          object
월             int32
일             int32
년             int32
dtypes: float64(1), int32(4), object(4)
memory usage: 442.3+ MB
In [29]:
# Parse the date strings with an explicit month/day/year format so that values
# such as 2/5/2006 can never be read day-first, and so pandas does not have to
# guess the layout for every row. Surrounding whitespace is stripped first,
# mirroring the earlier date-splitting cell.
# The dtype guard keeps the cell re-runnable: once the column is already
# datetime, the string accessor is skipped and the column is left untouched.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'].str.strip(), format='%m/%d/%Y')
df.head()
Out[29]:
user beer_name brewery beer_style score date 월 일 년
0 3355 King Two Fisted Old Ale King Brewing Company Old Ale 1.2 2006-04-26 4 26 2006
1 3355 Flying Dog Snake Dog IPA (through 2007) Flying Dog Brewery India Pale Ale (IPA) 3.3 2006-03-18 3 18 2006
2 3355 Bluegrass American Pale Ale Bluegrass Brewing Company American Pale Ale 3.3 2006-02-26 2 26 2006
3 3355 Flying Dog Old Scratch Amber Lager Flying Dog Brewery California Common 2.7 2006-02-18 2 18 2006
4 3355 Casco Bay Pale Ale Casco Bay Brewing American Pale Ale 3.1 2006-02-05 2 5 2006
In [35]:
# Order the ratings by user, and chronologically within each user.
sort_keys = ['user', 'date']
df2 = df.sort_values(sort_keys)
df2.head(10)
Out[35]:
user beer_name brewery beer_style score date 월 일 년
9707 2 Worthington's White Shield Molson Coors UK (Molson Coors) Premium Bitter/ESB 4.4 2000-04-16 4 16 2000
9706 2 Rogue Dry Hopped St. Rogue Red Ale Rogue Ales Amber Ale 4.3 2000-04-23 4 23 2000
9701 2 Maclay Oat Malt Stout Clockwork (Maclay) Stout 2.4 2000-04-28 4 28 2000
9702 2 Newcastle Brown Ale Heineken Nederland Brown Ale 3.9 2000-04-28 4 28 2000
9703 2 Rogue Mocha Porter Rogue Ales Porter 4.9 2000-04-28 4 28 2000
9704 2 Flying Dog Doggie Style Pale Ale Flying Dog Brewery American Pale Ale 4.4 2000-04-28 4 28 2000
9705 2 Fuller's India Pale Ale (Bottle/Keg) Fuller's India Pale Ale (IPA) 3.5 2000-04-28 4 28 2000
9687 2 Samuel Adams Pale Ale Boston Beer Company English Pale Ale 3.6 2000-05-03 5 3 2000
9688 2 Dogwood Breakdown IPA Dogwood Brewing Company (GA) India Pale Ale (IPA) 4.3 2000-05-03 5 3 2000
9689 2 King and Barnes IPA King and Barnes Premium Bitter/ESB 4.3 2000-05-03 5 3 2000
In [38]:
# Write the sorted frame to disk without the index column.
output_path = 'finally_beer_1.csv'
df2.to_csv(output_path, index=False)
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2025/01   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
글 보관함