import seaborn as sns

# Load the titanic dataset
tips_data = sns.load_dataset('tips')




tips_data.head(15)

Anaylzing the data set

This data set analyzes the amount of tip and includes different variables such as the gender, smoker, day and time, and it also includes the size
The tip varies with total bill however the other factors may have an impact

df = tips_data

df['sex'] = df['sex'].apply(lambda x: 1 if x == 'Male' else 0)
# 0 is female and 1 is male
df['smoker'] = df['smoker'].apply(lambda x: 1 if x == 'Yes' else 0)
# smoker is 1 non smoker is 0
df.head()

from sklearn.preprocessing import OneHotEncoder
import pandas as pd
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[['day']])
onehot = enc.transform(df[['day']]).toarray()
cols = ['day_' + val for val in enc.categories_[0]]
df[cols] = pd.DataFrame(onehot)
df.drop(['day'], axis=1, inplace=True)
df.dropna(inplace=True)

df.head()

from sklearn.preprocessing import OneHotEncoder
import pandas as pd
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[['time']])
onehot = enc.transform(df[['time']]).toarray()
cols = ['time_' + val for val in enc.categories_[0]]
df[cols] = pd.DataFrame(onehot)
df.drop(['time'], axis=1, inplace=True)
df.dropna(inplace=True)

df.head()

print(df.query("tip > 0").mean())

total_bill     19.785943
tip             2.998279
size            2.569672
day_Fri         0.077869
day_Sat         0.356557
day_Sun         0.311475
day_Thur        0.254098
time_Dinner     0.721311
time_Lunch      0.278689
dtype: float64

/tmp/ipykernel_12209/703362537.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  print(df.query("tip > 0").mean())

print("max for male")
print()
print(df.query("sex == 1").max())
print()
print("min for male")
print()
print(df.query("sex == 1").min())

max for male

total_bill     50.81
tip            10.00
size            6.00
day_Fri         1.00
day_Sat         1.00
day_Sun         1.00
day_Thur        1.00
time_Dinner     1.00
time_Lunch      1.00
dtype: float64

min for male

total_bill     7.25
tip            1.00
size           1.00
day_Fri        0.00
day_Sat        0.00
day_Sun        0.00
day_Thur       0.00
time_Dinner    0.00
time_Lunch     0.00
dtype: float64

/tmp/ipykernel_12209/1846147331.py:3: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  print(df.query("sex == 1").max())
/tmp/ipykernel_12209/1846147331.py:7: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  print(df.query("sex == 1").min())

df.head()
df['tip_ratio'] = df['tip'] / df['total_bill']
df.head()

df.drop(['total_bill'], axis=1, inplace=True)

df.head()

df.drop(['tip'], axis=1, inplace=True)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


data = df


X = data[['size', 'sex', 'smoker', 'day_Fri', 'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch']]
y = data['tip_ratio']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

model = LinearRegression()


model.fit(X_train, y_train)
y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mse", mse)
print("mae", mae)
print("r2", r2)

mse 0.005770406726776936
mae 0.038372554793093525
r2 -0.00193944292273307

mean_size = df['size'].mean()

# Print the mean size
print(mean_size)

2.569672131147541

mean_gender = df['tip_ratio'].mean()

# Print the mean size
print("mean_gender", mean_gender)

mean_gender 0.16080258172250478

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4
5	25.29	4.71	Male	No	Sun	Dinner	4
6	8.77	2.00	Male	No	Sun	Dinner	2
7	26.88	3.12	Male	No	Sun	Dinner	4
8	15.04	1.96	Male	No	Sun	Dinner	2
9	14.78	3.23	Male	No	Sun	Dinner	2
10	10.27	1.71	Male	No	Sun	Dinner	2
11	35.26	5.00	Female	No	Sun	Dinner	4
12	15.42	1.57	Male	No	Sun	Dinner	2
13	18.43	3.00	Male	No	Sun	Dinner	4
14	14.83	3.02	Female	No	Sun	Dinner	2

	tip	sex	size	day_Sun	time_Dinner	tip_ratio
0	1.01	0	2	1.0	1.0	0.059447
1	1.66	1	3	1.0	1.0	0.160542
2	3.50	1	3	1.0	1.0	0.166587
3	3.31	1	2	1.0	1.0	0.139780
4	3.61	0	4	1.0	1.0	0.146808