import seaborn as sns
# Load seaborn's example "tips" dataset (restaurant bills and tips).
tips_data = sns.load_dataset('tips')
tips_data.head(15)  # preview; only rendered when run as a notebook cell
# NOTE: `df` is an alias of `tips_data`, not a copy, so every in-place edit
# made through `df` is visible through both names.
df = tips_data
# Encode the two binary columns as plain integers.  Compare-and-cast is used
# instead of `.apply(lambda ...)` because seaborn loads these columns with a
# categorical dtype: `apply` on a categorical maps the categories and returns
# another categorical, which numeric reductions such as `.mean()` reject on
# recent pandas versions.  Missing values still map to 0, as before.
df['sex'] = (df['sex'] == 'Male').astype(int)  # 1 = male, 0 = female
df['smoker'] = (df['smoker'] == 'Yes').astype(int)  # 1 = smoker, 0 = non-smoker
df.head()
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# One-hot encode `day`: one 0/1 indicator column per category, named
# `day_<category>`, then remove the original column.
enc = OneHotEncoder(handle_unknown='ignore')
onehot = enc.fit_transform(df[['day']]).toarray()
cols = [f'day_{val}' for val in enc.categories_[0]]
# Build the indicator frame on df's own index.  Assigning a DataFrame to
# columns aligns on index labels, so a fresh 0..n-1 index would produce NaNs
# (and the dropna below would then silently discard rows) whenever df's index
# is not the default range, e.g. after rows have been filtered out.
df[cols] = pd.DataFrame(onehot, index=df.index, columns=cols)
df.drop(columns=['day'], inplace=True)
# Drops any row that has a missing value in any column.
df.dropna(inplace=True)
df.head()
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# One-hot encode `time`: one 0/1 indicator column per category, named
# `time_<category>`, then remove the original column.
enc = OneHotEncoder(handle_unknown='ignore')
onehot = enc.fit_transform(df[['time']]).toarray()
cols = [f'time_{val}' for val in enc.categories_[0]]
# Build the indicator frame on df's own index.  Assigning a DataFrame to
# columns aligns on index labels, so a fresh 0..n-1 index would produce NaNs
# (and the dropna below would then silently discard rows) whenever df's index
# is not the default range, e.g. after rows have been filtered out.
df[cols] = pd.DataFrame(onehot, index=df.index, columns=cols)
df.drop(columns=['time'], inplace=True)
# Drops any row that has a missing value in any column.
df.dropna(inplace=True)
df.head()
# Column-wise summary statistics.
# `numeric_only=True` restricts each reduction to numeric columns, so the
# summaries keep working if the frame still holds non-numeric (for example
# categorical) columns; recent pandas versions raise a TypeError for those
# instead of skipping them.
print(df.query("tip > 0").mean(numeric_only=True))  # means over rows with a positive tip

# Rows where sex == 1 (the male encoding); selected once, reused for max/min.
male_rows = df.query("sex == 1")
print("max for male")
print()
print(male_rows.max(numeric_only=True))
print()
print("min for male")
print()
print(male_rows.min(numeric_only=True))
df.head()
# Derive the regression target: the tip as a fraction of the total bill.
df['tip_ratio'] = df['tip'].div(df['total_bill'])
df.head()
# The raw amounts are removed once the ratio exists, one column at a time
# (in place, so the shared frame object is kept).
del df['total_bill']
df.head()
del df['tip']
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Fit a linear regression predicting `tip_ratio` from the encoded features
# and report hold-out error metrics.
data = df
# Predictors: party size, the two binary flags, and every one-hot indicator.
feature_cols = [
    'size', 'sex', 'smoker',
    'day_Fri', 'day_Sat', 'day_Sun', 'day_Thur',
    'time_Dinner', 'time_Lunch',
]
X = data[feature_cols]
y = data['tip_ratio']
# 70/30 train/test split.  The fixed seed makes the shuffle, and therefore
# the metrics printed below, repeatable from run to run; without it every
# execution scores a different random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Evaluate on the held-out 30%.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("mse", mse)
print("mae", mae)
print("r2", r2)
# Average of the `size` column.
size_column = df['size']
mean_size = size_column.mean()
print(mean_size)
# Average of the `tip_ratio` column.
# NOTE(review): despite its name and printed label, `mean_gender` holds the
# mean tip ratio, not anything gender-specific; both are kept unchanged so
# existing output and any later references stay the same.
ratio_column = df['tip_ratio']
mean_gender = ratio_column.mean()
print("mean_gender", mean_gender)