import pandas as pd
# Location of the Titanic training set (CSV) used throughout the workshop.
TITANIC_CSV_URL = 'https://gist.githubusercontent.com/l8doku/f291d09e88c866d3a044212b45cb5e23/raw/5d7e7aa3ca4fd03ab3c051b4adc304c4ceb01e81/titanic_train.csv'
data = pd.read_csv(TITANIC_CSV_URL)
# First look at the table: the first ten rows, a per-column summary
# printed by info(), and descriptive statistics from describe().
data.head(10)
data.info()
data.describe()
Change the code in the corresponding cells or write your own
# change this code
# How many passengers share each distinct value of Age.
age_column = data['Age']
age_column.value_counts()
# change this code
# see what happens with margins=True
# Cross-tabulation of the SibSp values (rows) against Survived (columns).
pd.crosstab(index=data['SibSp'], columns=data['Survived'], margins=False)
# Summary statistics of the ticket fare.
# NOTE(review): the 'Fare' column is inferred from the variable names
# (mean_fare, std_fare, median_fare) — confirm against the task statement.
fare = data['Fare']
mean_fare = fare.mean()
std_fare = fare.std()
median_fare = fare.median()
# ':.3' limits each value to three significant digits.
print(f'{mean_fare:.3}')
print(f'{std_fare:.3}')
print(f'{median_fare:.3}')
# use the same boolean-mask indexing as in previous workshops to filter old and young
is_young = data['Age'] < 30
young = data[is_young]
# then use .mean() over the "Survived" column to get the average chance of survival.
# use print(f'{variable:.3}') to limit the output to three significant digits
# use the same logic as in task 4
# name has structure
# last_name, Mr. first_name (middle_name)
# i.e. "<surname>, <title> <first name> ..." separated by spaces;
# the surname itself may consist of several words
def get_first_name(full_name: str) -> str:
    """Return the first given name from a "last_name, Title first_name ..." string.

    The text before the first comma is treated as the surname, the first
    whitespace-separated token after the comma as the title, and the token
    right after the title is returned. Locating the comma first keeps
    multi-word surnames (e.g. "Vander Planke, Mrs. Julius") from shifting
    the positions, which a plain ``split()[2]`` would get wrong.

    Args:
        full_name: Passenger name string.

    Returns:
        The token that follows the title.

    Raises:
        IndexError: If there is no token in the expected position.
    """
    _surname, comma, given_part = full_name.partition(',')
    if not comma:
        # No comma-delimited surname: fall back to the third token overall.
        return full_name.split()[2]
    # given_part looks like " Mr. Owen Harris": token 0 is the title,
    # token 1 is the first name.
    return given_part.split()[1]
# use this column of data to solve the task:
# get_first_name is applied to every value of the Name column
name_column = data['Name']
name_column.apply(get_first_name)
# use logic similar to task 6
# create a new column with surname length.
# Then filter out passengers by class and find the average name length of each class
# you may use a lambda function instead of a regular one
# Fare paid by the passengers who survived and by those who did not.
# NOTE(review): assumes Survived is coded 1 = survived, 0 = died, and that
# the plotted quantity is 'Fare' (taken from the plot title) — confirm.
rescued = data.loc[data['Survived'] == 1, 'Fare']
victims = data.loc[data['Survived'] == 0, 'Fare']
rescued.hist(color="green", label='Survived')
# histogram for victims; semi-transparent so it does not hide the bars drawn above
victims.hist(color="red", alpha=0.5, label='Victims')
import matplotlib.pyplot as plt
plt.title('Ticket fare for survived passengers and victims')
plt.xlabel('Pounds')
plt.ylabel('Frequency')
# The trailing semicolon suppresses the returned object's repr in a notebook.
plt.legend();
# one way to solve:
# get different values for sex/class with data['Pclass'].unique()
# iterate over various values of unique sets
# filter the data by current iterations
# compute average age
# another way to solve: groupby
# for (pclass, sex), filtered_dt in data.groupby(['Pclass', 'Sex']):
To install seaborn from the IDE: File → Settings (Preferences) → Project: [Project-name] → Python Interpreter → "+" → enter "seaborn" → Install Package,
or run the following in a notebook cell:
!pip install seaborn
import seaborn as sns
Plot pairwise relationships between the features `Age`, `Fare`, `SibSp`, `Parch`, `Embarked` and `Survived`.
sns.pairplot(data[['Survived', 'Age', 'Fare', 'SibSp',
'Parch', 'Embarked']]);
How does the ticket fare (`Fare`) depend on the cabin class (`Pclass`)? Plot boxplot and catplot.
sns.boxplot(x='Pclass', y='Fare', data=data)
# Both plots show Fare per Pclass; only the plot kind differs.
fare_by_class = {'x': 'Pclass', 'y': 'Fare', 'data': data}
# catplot with its default kind
sns.catplot(**fare_by_class)
# swarm variant; s means marker size
sns.catplot(**fare_by_class, kind='swarm', s=2)
How does the ticket fare (`Fare`) depend on the number of siblings and spouses (`SibSp`)? Plot boxplot and catplot.
# your code
Remove the values of `Fare` that are too far away from the average.
# the algorithm to remove outliers is the following:
# compute average value .mean()
# compute standard deviation: .std()
# filter data by the following criteria:
# if the data is too far away from average (more than 2 standard deviations away), discard it
# otherwise, keep it
# do this for all classes separately
# plot filtered data