Stat 112 - Recitation 8

Ozancan Ozdemir - ozancan@metu.edu.tr


Introduction to Pandas

This introduction was generated by ChatGPT.

Pandas is a software library written for the Python programming language for data manipulation and analysis. It is often used for working with data in tabular form, such as data stored in a spreadsheet or a database table.

Pandas provides a powerful set of tools for working with and analyzing data, including data frames, which are similar to tables in a relational database, and tools for working with time series data.

To install pandas with Anaconda, run the following command in your Anaconda terminal.

conda install pandas

However, this is not an issue in Colaboratory, where Pandas comes preinstalled. :)

Once Pandas is installed, you can import it into your Python script using the following code:

In [1]:
import pandas as pd #import the package

The pd alias will be used to reference Pandas throughout your code.

In Pandas, the main data structures for storing and working with data are the Series and the DataFrame; older versions also had a Panel.

A Series is a one-dimensional array-like object that can hold any data type. It has a labeled index, which is used to access the elements of the Series.

A DataFrame is a 2-dimensional tabular data structure that can hold any data type. It is similar to a spreadsheet or a SQL table. A DataFrame has a labeled index and labeled columns, which allow you to access the data in the table using the labels rather than numeric indices.

A Panel was a 3-dimensional data container, but it has been removed from recent versions of Pandas. In any case, we mostly use the DataFrame, at least in this course.

You can think of a DataFrame as a collection of Series objects that share the same index. This makes it easy to perform operations on data in the table, such as selecting specific rows or columns, or applying calculations to entire columns of data.
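As a quick sketch of this idea (the names pop, country, and df_sketch are invented for illustration), two Series sharing an index line up into the columns of a single DataFrame:

import pandas as pd

# two Series that share the same index...
pop = pd.Series([37274000, 32065760], index=['Tokyo', 'Delhi'])
country = pd.Series(['Japan', 'India'], index=['Tokyo', 'Delhi'])

# ...combine into one DataFrame, one Series per column
df_sketch = pd.DataFrame({'Population': pop, 'Country': country})
print(type(df_sketch['Population']))  # <class 'pandas.core.series.Series'>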

[Figure: Series and DataFrame structure. Source: Data Science Made Simple]

You can create a Series or DataFrame in Pandas by passing a list or array of data to the pandas.Series() or pandas.DataFrame() constructor, respectively. You can also import data from a file or other data source into a DataFrame using the pandas.read_*() functions, where the * is replaced with the file format (e.g. csv or excel).

In [2]:
# create a Series
s = pd.Series([1,9,5,6])
print(s)
0    1
1    9
2    5
3    6
dtype: int64
In [3]:
# create a DataFrame
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(df)
   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9

Although the created objects above include only integers, there are several data types that can be used in a Pandas DataFrame, including:

  • object: a string or mixed data type
  • int64: a 64-bit integer
  • float64: a 64-bit floating-point number
  • bool: a boolean value (True or False)
  • datetime64: a date and time, stored in a 64-bit format
  • timedelta[ns]: a duration of time, stored in a 64-bit format

These are just some of the data types that can be used in Pandas. You can use the DataFrame.dtypes property to view the data type for each column in a DataFrame.

In [4]:
df.dtypes
Out[4]:
0    int64
1    int64
2    int64
dtype: object
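The toy frame above contains only int64 columns. As a minimal sketch (column names made up here), a frame with mixed contents gets one dtype per column:

mixed = pd.DataFrame({'name':  ['a', 'b'],       # object
                      'count': [1, 2],           # int64
                      'score': [0.5, 1.5],       # float64
                      'flag':  [True, False]})   # bool
print(mixed.dtypes)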

Creating Data Frame from Scratch

In pandas, there are two main ways to create a data frame from scratch: from a list or from a dictionary. Creating a data frame from a list has already been done above. Now, we will look at how to use dictionaries to create a data frame in pandas.

Note that dictionaries provide an advantage when it comes to assigning column names to your data frame.

When you use a list to create a data frame, you have to define the column names afterwards. With a dictionary, however, you define the column names and the values at the same time, as the sketch below shows.
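Here is a minimal sketch of the difference, with toy values and invented names:

# with a list, the column names are supplied separately via the columns argument
df_list = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])

# with a dictionary, the keys double as the column names
df_dict = pd.DataFrame({'a': [1, 3], 'b': [2, 4]})

print(df_list.equals(df_dict))  # True: the two frames are identical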

Reminder

A typical dictionary object in Python has the following structure.

dict_name = {'key' : value}

or

dict_name = { 'key' : [value1,value2] }

Keys and values can be integers, strings, or many other types.

In [5]:
# Dictionary with integer keys
my_dict = {1: 'A', 2: 'B'}
print(my_dict)

# Dictionary with string keys
my_dict = {'name': 'ozan', 'age': 28}
print(my_dict)

# Dictionary with mixed keys
my_dict = {'name': 'ozan', 1: ['A', 'B']}
print(my_dict)
{1: 'A', 2: 'B'}
{'name': 'ozan', 'age': 28}
{'name': 'ozan', 1: ['A', 'B']}

Now, we will create the following table using dictionaries. In doing so, we use the pd.DataFrame() constructor.

[Figure: target table — the world's ten most populous cities, with country, 2022 and 2021 populations, and growth rate]

In [6]:
table = pd.DataFrame({'City' : ['Tokyo','Delhi','Shangai','Dhaka','Sao Paulo','Mexico City','Cairo','Beijing','Mumbai','Osaka'],
                      'Country': ['Japan','India','China','Bangladesh','Brazil','Mexico','Egypt','China','India','Japan'],
                      '2022 Population':[37274000,32065760,28516904,22478116,22429800,22085140,21750020,21333332,20961472,19059856],
                      '2021 Population': [37339804,31181376,27795702,21741090,22237472,21918936,21322750,20896820,20667656,19110616],
                      'Growth': [-0.18,2.84,2.59,3.39,0.86,0.76,2,2.09,1.42,-0.27]})

print(table)
          City     Country  2022 Population  2021 Population  Growth
0        Tokyo       Japan         37274000         37339804   -0.18
1        Delhi       India         32065760         31181376    2.84
2      Shangai       China         28516904         27795702    2.59
3        Dhaka  Bangladesh         22478116         21741090    3.39
4    Sao Paulo      Brazil         22429800         22237472    0.86
5  Mexico City      Mexico         22085140         21918936    0.76
6        Cairo       Egypt         21750020         21322750    2.00
7      Beijing       China         21333332         20896820    2.09
8       Mumbai       India         20961472         20667656    1.42
9        Osaka       Japan         19059856         19110616   -0.27

Exercise 1

Please create the following DataFrame and name it ex1.

[Figure: target DataFrame for Exercise 1 — monthly starving line and poverty line values]

In [ ]:
 

After creating the data frame, we will introduce some Pandas attributes of DataFrame objects.

In [7]:
table.dtypes #list the data types of the series
Out[7]:
City                object
Country             object
2022 Population      int64
2021 Population      int64
Growth             float64
dtype: object
In [8]:
table.shape #shows the number of rows and columns of your data frame
Out[8]:
(10, 5)

Displaying general information about the data frame: You can print the information above all at once using the df.info() method, where df is the name of the data frame.

In [9]:
table.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City             10 non-null     object 
 1   Country          10 non-null     object 
 2   2022 Population  10 non-null     int64  
 3   2021 Population  10 non-null     int64  
 4   Growth           10 non-null     float64
dtypes: float64(1), int64(2), object(2)
memory usage: 528.0+ bytes

When you work with a longer data frame, it is not necessary to view all observations. The df.head() function shows the first 5 observations in your data, while df.tail() shows the last 5.

In [10]:
table.head() #shows first 5 observations 
Out[10]:
City Country 2022 Population 2021 Population Growth
0 Tokyo Japan 37274000 37339804 -0.18
1 Delhi India 32065760 31181376 2.84
2 Shangai China 28516904 27795702 2.59
3 Dhaka Bangladesh 22478116 21741090 3.39
4 Sao Paulo Brazil 22429800 22237472 0.86
In [11]:
table.head(3) #shows first three observations
Out[11]:
City Country 2022 Population 2021 Population Growth
0 Tokyo Japan 37274000 37339804 -0.18
1 Delhi India 32065760 31181376 2.84
2 Shangai China 28516904 27795702 2.59
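df.tail() is not demonstrated above, so here is a quick sketch; it mirrors head():

table.tail(2)  # shows the last two rows of the data (here, Mumbai and Osaka)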

Adding a new column: You can easily add new information to your data frame. Let's add the continent of each country. But first, create the continent object.

In [12]:
import numpy as np 

continent =  np.repeat(np.array(['Asia','S.America','Africa','Asia']),[4,2,1,3],axis = 0)
continent
Out[12]:
array(['Asia', 'Asia', 'Asia', 'Asia', 'S.America', 'S.America', 'Africa',
       'Asia', 'Asia', 'Asia'], dtype='<U9')
In [13]:
table['continent']  = continent 
#df_name['new column name'] = new series
table.head()
Out[13]:
City Country 2022 Population 2021 Population Growth continent
0 Tokyo Japan 37274000 37339804 -0.18 Asia
1 Delhi India 32065760 31181376 2.84 Asia
2 Shangai China 28516904 27795702 2.59 Asia
3 Dhaka Bangladesh 22478116 21741090 3.39 Asia
4 Sao Paulo Brazil 22429800 22237472 0.86 S.America
In [14]:
table.columns # list the columns in the data
Out[14]:
Index(['City', 'Country', '2022 Population', '2021 Population', 'Growth',
       'continent'],
      dtype='object')

Subsetting a Data Frame: Pandas provides several ways to access a subset of a DataFrame. In other words, you can pull out specific rows, columns, slices or cells of the DataFrame object using different approaches.

In [15]:
table['City']
#df_name['Column name']
Out[15]:
0          Tokyo
1          Delhi
2        Shangai
3          Dhaka
4      Sao Paulo
5    Mexico City
6          Cairo
7        Beijing
8         Mumbai
9          Osaka
Name: City, dtype: object
In [16]:
table[['City','Country']] 
#for multiple extraction, create a list object. 
Out[16]:
City Country
0 Tokyo Japan
1 Delhi India
2 Shangai China
3 Dhaka Bangladesh
4 Sao Paulo Brazil
5 Mexico City Mexico
6 Cairo Egypt
7 Beijing China
8 Mumbai India
9 Osaka Japan

You can access your data using loc[] and iloc[] in Pandas.

loc stands for "location" and is used to select data by the labels of the rows and columns. For example, to select the row labeled 0 in a DataFrame, you would use df.loc[0].

iloc stands for "integer location" and is used to select data by the position of the rows and columns. So, to select the first row of a DataFrame using iloc, you would use df.iloc[0].

One key difference between the two is that loc uses the labels of the rows and columns to identify which data to select, while iloc uses the integer positions of the rows and columns.

This means that if you change the row labels or column names, loc works with the new labels, while iloc always uses the current integer positions of the rows and columns, as the sketch below illustrates.

Indexing starts from 0 in Pandas, as in Numpy.
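Here is a small sketch of the label/position distinction using the table above (the name labeled is ours). Once the city names become the row labels, loc answers by label and iloc by position:

labeled = table.set_index('City')  # row labels are now the city names

labeled.loc['Tokyo']   # label-based: the row whose label is 'Tokyo'
labeled.iloc[0]        # position-based: the first row, whatever its label

# sorting changes positions but not labels
labeled.sort_index().iloc[0]       # now Beijing, alphabetically first
labeled.sort_index().loc['Tokyo']  # still Tokyo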

In [17]:
table.loc[0] #first row of the data
Out[17]:
City                  Tokyo
Country               Japan
2022 Population    37274000
2021 Population    37339804
Growth                -0.18
continent              Asia
Name: 0, dtype: object

Since this data has no custom row labels, the default integer labels are what we pass to loc.

In [18]:
table.iloc[[0,3,8]] #1st, 4th, 9th rows of the data
Out[18]:
City Country 2022 Population 2021 Population Growth continent
0 Tokyo Japan 37274000 37339804 -0.18 Asia
3 Dhaka Bangladesh 22478116 21741090 3.39 Asia
8 Mumbai India 20961472 20667656 1.42 Asia
In [19]:
table['City'].iloc[2] #the name of the third city
Out[19]:
'Shangai'

You can use boolean indexing for slicing a DataFrame in Pandas. Remember that boolean indexing is useful when you access your data based on a condition.

For example, suppose we would like to list the information for the cities whose current population exceeds 24000000.

In [20]:
table[table['2022 Population']>24000000]
Out[20]:
City Country 2022 Population 2021 Population Growth continent
0 Tokyo Japan 37274000 37339804 -0.18 Asia
1 Delhi India 32065760 31181376 2.84 Asia
2 Shangai China 28516904 27795702 2.59 Asia
In [21]:
#If we only want to list the city names.

table['City'].loc[table['2022 Population']>24000000]
Out[21]:
0      Tokyo
1      Delhi
2    Shangai
Name: City, dtype: object

If you try this with iloc, you will get an error.

table['City'].iloc[table['2022 Population']>24000000]

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-39-b13d5c365ef6> in <module>
----> 1 table['City'].iloc[table['2022 Population']>24000000]

2 frames
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _validate_key(self, key, axis)
   1393             if hasattr(key, "index") and isinstance(key.index, Index):
   1394                 if key.index.inferred_type == "integer":
-> 1395                     raise NotImplementedError(
   1396                         "iLocation based boolean "
   1397                         "indexing on an integer type "

NotImplementedError: iLocation based boolean indexing on an integer type is not available
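If you do want iloc here, one workaround is converting the boolean Series to a plain array first (a sketch; .loc is the idiomatic choice):

mask = (table['2022 Population'] > 24000000).values  # plain boolean array, no index
table['City'].iloc[mask]  # iloc accepts boolean arrays, just not boolean Series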

Renaming a column

You can change either a specific column name or all of the column names in a DataFrame object.

Check rename and columns

df.rename({'old name': 'new name'},axis = "columns")

You can change the row labels with rename if axis = "rows".

To change all the column names simultaneously, assign a list to the columns attribute:

df.columns = ['c1','c2','c3']

In [22]:
table.rename({'continent': 'Continent'},axis = "columns")
Out[22]:
City Country 2022 Population 2021 Population Growth Continent
0 Tokyo Japan 37274000 37339804 -0.18 Asia
1 Delhi India 32065760 31181376 2.84 Asia
2 Shangai China 28516904 27795702 2.59 Asia
3 Dhaka Bangladesh 22478116 21741090 3.39 Asia
4 Sao Paulo Brazil 22429800 22237472 0.86 S.America
5 Mexico City Mexico 22085140 21918936 0.76 S.America
6 Cairo Egypt 21750020 21322750 2.00 Africa
7 Beijing China 21333332 20896820 2.09 Asia
8 Mumbai India 20961472 20667656 1.42 Asia
9 Osaka Japan 19059856 19110616 -0.27 Asia
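Note that rename returned a new DataFrame above; table itself still has the lowercase continent column. To keep the change, assign the result back (table2 is just an illustrative name):

table2 = table.rename({'continent': 'Continent'}, axis = "columns")
# or modify in place: table.rename({'continent': 'Continent'}, axis = "columns", inplace = True)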

Exercise 2

Consider the ex1 DataFrame that you created in Exercise 1.

Answer the following questions using this data frame.

a) Show all of the starving and poverty information for the first 6 months of the year.

In [ ]:
 

b) Display the starving line column only.

In [ ]:
 

c) Display the rows where the poverty line is below 20000.

In [ ]:
 

d) List the names of the months where the starving line exceeds 7000.

In [ ]:
 

Creating Data Frame by Importing Data

You can import data from several sources into your environment using pandas.

You can use the pandas.read_*() functions, where the * is replaced with the file format (e.g. csv or excel).

Before proceeding, note the easiest way to bring your data into Colaboratory: just drag and drop.

[Figure: drag-and-drop upload into the Colab file pane]

Note that the data you drag and drop is deleted when you close the notebook in Colab.

Now, consider teams.csv, teams.xlsx and https://metustat112.github.io/teams.csv.

Way 1

In [23]:
teams = pd.read_csv("teams.csv")
#teams = pd.read_csv("Teams.csv",sep = ";")
#dataname = pd.read_csv("filename.csv",sep = ";")
#sep argument specifies a custom delimiter for the CSV input, where the default is comma.
teams.head()
Out[23]:
Unnamed: 0 team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
0 0 Brazil 1 South America 5 3 1 1 9 4 7 2
1 1 Spain 2 Europe 6 5 0 1 7 2 3 0
2 2 Portugal 3 Europe 4 1 2 1 7 1 8 1
3 3 Netherlands 4 Europe 6 6 0 0 12 5 15 0
4 4 Italy 5 Europe 3 0 2 1 4 5 5 0

Way 2

In [24]:
teams_excel = pd.read_excel('teams.xlsx')
teams_excel.head()
Out[24]:
team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
0 Brazil 1 South America 5 3 1 1 9 4 7 2
1 Spain 2 Europe 6 5 0 1 7 2 3 0
2 Portugal 3 Europe 4 1 2 1 7 1 8 1
3 Netherlands 4 Europe 6 6 0 0 12 5 15 0
4 Italy 5 Europe 3 0 2 1 4 5 5 0

Way 3

In [25]:
teams_url = pd.read_csv('https://metustat112.github.io/teams.csv')
teams_url.head()
Out[25]:
Unnamed: 0 team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
0 0 Brazil 1 South America 5 3 1 1 9 4 7 2
1 1 Spain 2 Europe 6 5 0 1 7 2 3 0
2 2 Portugal 3 Europe 4 1 2 1 7 1 8 1
3 3 Netherlands 4 Europe 6 6 0 0 12 5 15 0
4 4 Italy 5 Europe 3 0 2 1 4 5 5 0

Continue with the teams data.

In [26]:
teams.head() #shows the first five observations
#dataname.function()
Out[26]:
Unnamed: 0 team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
0 0 Brazil 1 South America 5 3 1 1 9 4 7 2
1 1 Spain 2 Europe 6 5 0 1 7 2 3 0
2 2 Portugal 3 Europe 4 1 2 1 7 1 8 1
3 3 Netherlands 4 Europe 6 6 0 0 12 5 15 0
4 4 Italy 5 Europe 3 0 2 1 4 5 5 0

On the left-hand side of the output, the sequence starting from 0 is the index of the data frame, created automatically by pandas. In addition, the first column of the data is an Unnamed: 0 column: this is the old row index that was written into the CSV file.

We should remove it to work efficiently and save memory. There are various ways to get rid of this column. Here is one solution, the drop() function, which is used to drop specified labels from rows or columns.

In [27]:
teams_new = teams.drop("Unnamed: 0", axis=1) #axis = 1 drops columns; axis = 0 drops rows
teams_new.head()
Out[27]:
team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
0 Brazil 1 South America 5 3 1 1 9 4 7 2
1 Spain 2 Europe 6 5 0 1 7 2 3 0
2 Portugal 3 Europe 4 1 2 1 7 1 8 1
3 Netherlands 4 Europe 6 6 0 0 12 5 15 0
4 Italy 5 Europe 3 0 2 1 4 5 5 0

After dropping the unnecessary column, export your data set.

You can use the df.to_*('filename.*') functions, where the * is replaced with the file format (e.g. csv or excel), filename is the name of the exported file, and df is the name of the data frame.

In [28]:
teams_new.to_csv('teamsnew.csv')
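By default, to_csv also writes the row index as an extra first column, which is exactly where the Unnamed: 0 column above came from. If you plan to re-import the file, passing index=False avoids this:

teams_new.to_csv('teamsnew.csv', index=False)  # omit the row index from the file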

Pandas Properties

Here, we present some pandas functions that are useful in data manipulation.

  • groupby

This function groups the data with respect to the levels of a variable. It is generally used to apply aggregate functions like min, median, mean, max, etc.

Now, group the teams by their continent.

In [29]:
teams_by_continent = teams_new.groupby('continent')
teams_by_continent.first()
#Compute the first non-null entry of each column.
Out[29]:
team ranking games wins draws losses goalsFor goalsAgainst yellowCards redCards
continent
Africa Cameroon 19 3 0 0 3 2 5 5 0
Asia Japan 45 4 2 1 1 4 2 7 0
Australia Australia 20 3 1 1 1 3 6 7 2
Europe Spain 2 6 5 0 1 7 2 3 0
North America USA 14 4 1 2 1 5 5 9 0
South America Brazil 1 5 3 1 1 9 4 7 2

The first method displays the first row of each group. To access the elements of a subgroup, use get_group('group name').

In [30]:
teams_by_continent.get_group(('Africa')) #the Africa countries in WC 2010
Out[30]:
team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
15 Cameroon 19 Africa 3 0 0 3 2 5 5 0
17 Nigeria 21 Africa 3 0 1 2 3 5 5 1
20 Ivory Coast 27 Africa 3 1 1 1 4 3 5 0
21 Algeria 30 Africa 3 0 1 2 0 2 4 2
23 Ghana 32 Africa 5 2 2 1 5 4 11 0
30 South Africa 83 Africa 3 1 1 1 3 5 4 1

You can group your data by more than one variable by passing a list to the groupby function.

In [31]:
teams_by_continent_and_win = teams_new.groupby(['continent','wins'])
teams_new.groupby(['continent','wins']).get_group(('Africa',0)) #the most unsuccessful African teams in WC 2010
Out[31]:
team ranking continent games wins draws losses goalsFor goalsAgainst yellowCards redCards
15 Cameroon 19 Africa 3 0 0 3 2 5 5 0
17 Nigeria 21 Africa 3 0 1 2 3 5 5 1
21 Algeria 30 Africa 3 0 1 2 0 2 4 2
  • aggregate

The aggregate method is used to apply a specific aggregation function to each column of a DataFrame. Well-known aggregate functions include min, max, sum, mean, describe, and more.

[Figure: common aggregate functions. Source: Spark by Examples]

In [32]:
teams_new.describe() #shows several descriptive statistics for the numerical variables
Out[32]:
ranking games wins draws losses goalsFor goalsAgainst yellowCards redCards
count 32.000000 32.000000 32.000000 32.000000 32.000000 32.00000 32.000000 32.000000 32.000000
mean 26.031250 3.875000 1.437500 1.000000 1.437500 4.34375 4.343750 7.156250 0.500000
std 24.233387 1.070122 1.522678 0.879883 0.715609 3.28839 2.208625 2.760427 0.718421
min 1.000000 3.000000 0.000000 0.000000 0.000000 0.00000 1.000000 2.000000 0.000000
25% 8.750000 3.000000 0.750000 0.000000 1.000000 2.00000 3.000000 5.000000 0.000000
50% 19.500000 3.500000 1.000000 1.000000 1.000000 3.00000 5.000000 7.000000 0.000000
75% 32.500000 4.250000 2.000000 1.250000 2.000000 5.25000 5.000000 8.250000 1.000000
max 105.000000 6.000000 6.000000 3.000000 3.000000 13.00000 12.000000 15.000000 2.000000
In [33]:
teams_new.describe().transpose() #by transposing, you can have a better view
Out[33]:
count mean std min 25% 50% 75% max
ranking 32.0 26.03125 24.233387 1.0 8.75 19.5 32.50 105.0
games 32.0 3.87500 1.070122 3.0 3.00 3.5 4.25 6.0
wins 32.0 1.43750 1.522678 0.0 0.75 1.0 2.00 6.0
draws 32.0 1.00000 0.879883 0.0 0.00 1.0 1.25 3.0
losses 32.0 1.43750 0.715609 0.0 1.00 1.0 2.00 3.0
goalsFor 32.0 4.34375 3.288390 0.0 2.00 3.0 5.25 13.0
goalsAgainst 32.0 4.34375 2.208625 1.0 3.00 5.0 5.00 12.0
yellowCards 32.0 7.15625 2.760427 2.0 5.00 7.0 8.25 15.0
redCards 32.0 0.50000 0.718421 0.0 0.00 0.0 1.00 2.0

What is missing in the output above?

Categorical variables, i.e. object-type variables.

In [34]:
teams_new.describe(include=['O'])
Out[34]:
team continent
count 32 32
unique 32 6
top Brazil Europe
freq 1 13

If you would like to apply a function to a specific column, select it first. The examples below use the .values attribute (the underlying array) and the value_counts() method. For example,

In [35]:
teams_new['wins'].values.mean() #the average wins 
Out[35]:
1.4375
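In fact, .values is not required here; calling the aggregation directly on the selected column gives the same answer:

teams_new['wins'].mean()  # 1.4375, the same result without .values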
In [36]:
teams_new['continent'].value_counts() #frequency distribution of continent
Out[36]:
Europe           13
South America     7
Africa            6
Asia              3
Australia         2
North America     1
Name: continent, dtype: int64
In [37]:
teams_by_continent.aggregate('mean') #calculate the average of each variable by continent. 
Out[37]:
ranking games wins draws losses goalsFor goalsAgainst yellowCards redCards
continent
Africa 35.333333 3.333333 0.666667 1.000000 1.666667 2.833333 4.000000 5.666667 0.666667
Asia 65.666667 3.666667 1.000000 0.666667 2.000000 3.666667 7.333333 5.000000 0.000000
Australia 49.000000 3.000000 0.500000 2.000000 0.500000 2.500000 4.000000 6.500000 1.000000
Europe 14.153846 3.923077 1.769231 0.769231 1.384615 4.846154 3.846154 7.538462 0.384615
North America 14.000000 4.000000 1.000000 2.000000 1.000000 5.000000 5.000000 9.000000 0.000000
South America 18.285714 4.571429 2.000000 1.142857 1.428571 5.428571 4.285714 8.571429 0.714286
In [38]:
teams_by_continent.aggregate('count') #shows the frequency of each column by continent. 
Out[38]:
team ranking games wins draws losses goalsFor goalsAgainst yellowCards redCards
continent
Africa 6 6 6 6 6 6 6 6 6 6
Asia 3 3 3 3 3 3 3 3 3 3
Australia 2 2 2 2 2 2 2 2 2 2
Europe 13 13 13 13 13 13 13 13 13 13
North America 1 1 1 1 1 1 1 1 1 1
South America 7 7 7 7 7 7 7 7 7 7

You can also specify a different aggregation function for each column by passing a dictionary to the aggregate method, where the keys are the column names and the values are the aggregation functions. For example:

In [39]:
teams_by_continent.aggregate({
    "wins": "mean",
    "goalsFor": "max",
    "goalsAgainst": "min"
})
Out[39]:
wins goalsFor goalsAgainst
continent
Africa 0.666667 5 2
Asia 1.000000 6 2
Australia 0.500000 3 2
Europe 1.769231 13 1
North America 1.000000 5 5
South America 2.000000 10 2

Example 3

Please read the Players.csv data and answer the following questions.

In [40]:
players  = pd.read_csv('Players.csv')
players.head()
Out[40]:
surname team position minutes shots passes tackles saves
0 Abdoun Algeria midfielder 16 0 6 0 0
1 Belhadj Algeria defender 270 1 146 8 0
2 Boudebouz Algeria midfielder 74 3 28 1 0
3 Bougherra Algeria defender 270 1 89 11 0
4 Chaouchi Algeria goalkeeper 90 0 17 0 2

a) Show the first 5 observations of the data.

In [ ]:
 

b) How many players are there in the data?

In [ ]:
 

c) What is the distribution of positions of the players? (i.e display the frequency of each position)

In [ ]:
 

d) What is the average number of passes by the players in WC 2010?

In [ ]:
 

e) What is the average number of minutes played by position in WC 2010?

In [ ]:
 

f) State the surname of the player who has the longest game time.

In [ ]:
 

g) State the surname of the goalkeeper with the most saves.

In [ ]:
 

Missing Data

Missing data occurs when no information is provided. Pandas provides tools that make dealing with missing data easier.

In pandas, the missing observations have two representations.

  • None: None is a Python singleton object that is often used for missing data in Python code.

  • NaN: NaN (an acronym for Not a Number) is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation.

We can use the following functions for missing-value analysis, i.e. we can detect and replace the missing values with these functions:

df.isnull()

df.notnull()

df.dropna()

df.fillna()

df.replace()

Consider the following example:

In [41]:
import pandas as pd
In [42]:
missing_data = pd.read_csv('https://metustat112.github.io/stolen_cars.csv',sep = ";")
print(missing_data)
                 Car   Number
0  Chevrolet Pick-Up  48206.0
1       Ford Pick-Up  47999.0
2        Honda Civic  31673.0
3       Honda Accord  30274.0
4       Toyota Camry  17270.0
5        GMC Pick-Up      NaN
6      Nissan Altima  14108.0
7         Honda CR-V      NaN

isnull and notnull serve the same purpose but return opposite outputs. While isnull returns True for missing values, notnull returns True for available values.

In [43]:
missing_data.isnull()
Out[43]:
Car Number
0 False False
1 False False
2 False False
3 False False
4 False False
5 False True
6 False False
7 False True
In [44]:
missing_data.notnull()
Out[44]:
Car Number
0 True True
1 True True
2 True True
3 True True
4 True True
5 True False
6 True True
7 True False

Instead of a matrix of boolean values, we can directly compute the total number of missing values in each column of the data, as given below.

In [45]:
missing_data.isnull().sum()
Out[45]:
Car       0
Number    2
dtype: int64

As seen above, we have 2 missing values in the Number column. Beyond the per-column counts, if we would like to compute the total number of missing observations across the whole data frame,

In [46]:
missing_data.isnull().sum().sum() #total number of missing observations 
Out[46]:
2

Dealing with Missing Data

The missing data problem is one of the hardest problems in data science. There are several approaches, ranging from simple to complex.

Before remedying the missing values in your data, you should figure out the source and the pattern of the missingness. After that, you can decide on your strategy.

The simplest but least preferable solution is dropping the missing observations.

You can remove the NA values with dropna.

In [47]:
missing_data.dropna(axis = 0) #axis = 0 removes the rows with na 
#axis = 1 drops the columns with na
Out[47]:
Car Number
0 Chevrolet Pick-Up 48206.0
1 Ford Pick-Up 47999.0
2 Honda Civic 31673.0
3 Honda Accord 30274.0
4 Toyota Camry 17270.0
6 Nissan Altima 14108.0

Imputing the missing values is certainly a better approach. You can fill in your missing observations in different ways; however, we will cover only the simplest ones within the context of this course.

fillna() fills all NA values with the same value.

In [48]:
missing_data.fillna(0)
Out[48]:
Car Number
0 Chevrolet Pick-Up 48206.0
1 Ford Pick-Up 47999.0
2 Honda Civic 31673.0
3 Honda Accord 30274.0
4 Toyota Camry 17270.0
5 GMC Pick-Up 0.0
6 Nissan Altima 14108.0
7 Honda CR-V 0.0

You can also use fillna to fill the missing values with descriptive statistics.

In [49]:
mean = missing_data['Number'].mean()
median = missing_data['Number'].median()
mode = missing_data['Number'].mode()
print('Mean:',mean,'Median:',median,'Mode:',mode)
Mean: 31588.333333333332 Median: 30973.5 Mode: 0    14108.0
1    17270.0
2    30274.0
3    31673.0
4    47999.0
5    48206.0
Name: Number, dtype: float64
In [50]:
missing_data['Number'].fillna(mean) #mean imputation 
Out[50]:
0    48206.000000
1    47999.000000
2    31673.000000
3    30274.000000
4    17270.000000
5    31588.333333
6    14108.000000
7    31588.333333
Name: Number, dtype: float64
In [51]:
missing_data['Number'].fillna(median) #median imputation
Out[51]:
0    48206.0
1    47999.0
2    31673.0
3    30274.0
4    17270.0
5    30973.5
6    14108.0
7    30973.5
Name: Number, dtype: float64
In [52]:
missing_data['Number'].fillna(mode) #mode imputation
Out[52]:
0    48206.0
1    47999.0
2    31673.0
3    30274.0
4    17270.0
5    48206.0
6    14108.0
7        NaN
Name: Number, dtype: float64
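The odd output above happens because mode() returns a Series (here containing every observed value, since none repeats), and fillna aligns a Series argument by index label rather than broadcasting a single value. To impute with one modal value, take the first element explicitly:

missing_data['Number'].fillna(mode[0])  # mode imputation: both NaNs become 14108.0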

replace substitutes a new value wherever the specified value occurs.

In [53]:
missing_data.replace(to_replace=np.nan,value=0) #replaces all missing values
Out[53]:
Car Number
0 Chevrolet Pick-Up 48206.0
1 Ford Pick-Up 47999.0
2 Honda Civic 31673.0
3 Honda Accord 30274.0
4 Toyota Camry 17270.0
5 GMC Pick-Up 0.0
6 Nissan Altima 14108.0
7 Honda CR-V 0.0

Example 4

Import the tao.csv data.

In [54]:
tao = pd.read_csv('tao.csv')
tao.head()
tao_new = tao.drop(['Unnamed: 0'],axis = 1)
tao_new.head()
Out[54]:
Year Latitude Longitude Sea.Surface.Temp Air.Temp Humidity UWind VWind
0 1997 0 -110 27.590000 27.15 79.599998 -6.4 5.4
1 1997 0 -110 27.549999 27.02 75.800003 -5.3 5.3
2 1997 0 -110 27.570000 27.00 76.500000 -5.1 4.5
3 1997 0 -110 27.620001 26.93 76.199997 -4.9 2.5
4 1997 0 -110 27.650000 26.84 76.400002 -3.5 4.1

a) Calculate the total number of missing observations in the data.

In [ ]:
 

b) Which column has the highest number of missing observations in the data?

In [ ]:
 

c) Fill the missing values with the mean of the corresponding columns.

In [ ]:
 

Suggested Sources

  • http://pandas.pydata.org/pandas-docs/stable/

  • Data Manipulation with Pandas

  • Pandas for Everyone

  • Data Analysis with Pandas

