#Import the needed base libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#Import encoders and scalers
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 

#Import data set splitter
from sklearn.model_selection import train_test_split

# Import metrics module for performance evaluation
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import fowlkes_mallows_score

# Import metrics module for performance evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Import Clustering models
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

# Import Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#Set the panda dataframes to display all columns
pd.set_option('display.max_columns', None)
#Set the visual theme for matplot and seaborn plots
sns.set_theme()


df = pd.read_csv("used_cars_data.csv")
df.head(10)


df2 = df


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000040 entries, 0 to 3000039
Data columns (total 66 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   vin                      object 
 1   back_legroom             object 
 2   bed                      object 
 3   bed_height               object 
 4   bed_length               object 
 5   body_type                object 
 6   cabin                    object 
 7   city                     object 
 8   city_fuel_economy        float64
 9   combine_fuel_economy     float64
 10  daysonmarket             int64  
 11  dealer_zip               object 
 12  description              object 
 13  engine_cylinders         object 
 14  engine_displacement      float64
 15  engine_type              object 
 16  exterior_color           object 
 17  fleet                    object 
 18  frame_damaged            object 
 19  franchise_dealer         bool   
 20  franchise_make           object 
 21  front_legroom            object 
 22  fuel_tank_volume         object 
 23  fuel_type                object 
 24  has_accidents            object 
 25  height                   object 
 26  highway_fuel_economy     float64
 27  horsepower               float64
 28  interior_color           object 
 29  isCab                    object 
 30  is_certified             float64
 31  is_cpo                   object 
 32  is_new                   bool   
 33  is_oemcpo                object 
 34  latitude                 float64
 35  length                   object 
 36  listed_date              object 
 37  listing_color            object 
 38  listing_id               int64  
 39  longitude                float64
 40  main_picture_url         object 
 41  major_options            object 
 42  make_name                object 
 43  maximum_seating          object 
 44  mileage                  float64
 45  model_name               object 
 46  owner_count              float64
 47  power                    object 
 48  price                    float64
 49  salvage                  object 
 50  savings_amount           int64  
 51  seller_rating            float64
 52  sp_id                    float64
 53  sp_name                  object 
 54  theft_title              object 
 55  torque                   object 
 56  transmission             object 
 57  transmission_display     object 
 58  trimId                   object 
 59  trim_name                object 
 60  vehicle_damage_category  float64
 61  wheel_system             object 
 62  wheel_system_display     object 
 63  wheelbase                object 
 64  width                    object 
 65  year                     int64  
dtypes: bool(2), float64(14), int64(4), object(46)
memory usage: 1.4+ GB


df=df[['make_name','model_name', 'year', 'body_type', 'listing_color', 'mileage', 'is_new', 'owner_count', 'has_accidents', 'price', 'maximum_seating', 'horsepower', 'engine_type', 'city_fuel_economy', 'highway_fuel_economy', 'transmission']]
df.head(10)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000040 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   make_name             object 
 1   model_name            object 
 2   year                  int64  
 3   body_type             object 
 4   listing_color         object 
 5   mileage               float64
 6   is_new                bool   
 7   owner_count           float64
 8   has_accidents         object 
 9   price                 float64
 10  maximum_seating       object 
 11  horsepower            float64
 12  engine_type           object 
 13  city_fuel_economy     float64
 14  highway_fuel_economy  float64
 15  transmission          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 346.2+ MB


df.replace(' ', '_', regex=True)
df.columns = df.columns.str.replace('_',' ')
df.columns = df.columns.str.upper()
df.head(10)


#The indexes of cars with incomplete data for body type
incompleteBodyType = df[df['BODY TYPE'].isna()].index
#Drop the rows
df = df.drop(incompleteBodyType)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2986497 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 367.4+ MB


#The indexes of used cars with incomplete data
usedIncompleteData = df[(df['IS NEW']== False) & (df['OWNER COUNT'].isna())].index
#Drop the rows
df = df.drop(usedIncompleteData)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2940141 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 361.7+ MB


#The indexes of new cars with misleading data
newIncorrectData = df[(df['IS NEW']== True) & (df['OWNER COUNT'].notna())].index
#Drop the rows
df = df.drop(newIncorrectData)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2939178 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 361.6+ MB


#Fill the rest of the na values in owner count with 0.0
df['OWNER COUNT'].fillna(0.0,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2939178 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 361.6+ MB


#The indexes of used cars with incomplete data
usedIncompleteData = df[(df['IS NEW'] == False) & (df['MILEAGE'].isna())].index
#Drop the rows
df = df.drop(usedIncompleteData)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2929176 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 360.4+ MB


#The indexes of new cars with mileage over 200
newOver200 = df[(df['IS NEW'] == True) & (df['MILEAGE'].notna()) & (df['MILEAGE'] > 200)].index
#Drop the rows
df = df.drop(newOver200)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870785 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 353.2+ MB


#The indexes of new cars with mileage over 200
newUnder200 = df[(df['IS NEW'] == True) & (df['MILEAGE'].notna()) & (df['MILEAGE'] < 200)].index
#Change the mileage to 0.0
df.loc[newUnder200,'MILEAGE'] = 0.0
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870785 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 417.7+ MB


#The indexes of new cars with mileage na
newNAMileage = df[(df['IS NEW'] == True) & (df['MILEAGE'].isna())]
#Change the mileage to 0.0
df['MILEAGE'].fillna(0.0,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870785 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 417.7+ MB


#The indexes of used cars with accidents na
usedAccidentsNA = df[(df['IS NEW'] == False) & (df['HAS ACCIDENTS'].isna())].index
#Drop the rows
df = df.drop(usedAccidentsNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870785 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 417.7+ MB


#The indexes of new cars with accidents
newAccidents = df[(df['IS NEW'] == True) & (df['HAS ACCIDENTS'] == True)].index
#Drop the rows
df = df.drop(newAccidents)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870782 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 353.2+ MB


#The indexes of new cars with accidents na
newAccidentsNA = df[(df['IS NEW'] == True) & (df['HAS ACCIDENTS'].isna())].index
#Change the accidents to False
df.loc[newAccidentsNA,'HAS ACCIDENTS'] = False
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2870782 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 417.7+ MB


#The indexes of the rows with maximum seating na
seatingNA = df[df['MAXIMUM SEATING'].isna()].index
#Drop the rows
df = df.drop(seatingNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733501 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 336.3+ MB


#The indexes of the rows with engine type na
engineNA = df[df['ENGINE TYPE'].isna()].index
#Drop the rows
df = df.drop(engineNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2702202 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 332.4+ MB


#The indexes of the rows with horsepower na
horseNA = df[df['HORSEPOWER'].isna()].index
#Drop the rows
df = df.drop(horseNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2685425 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 330.4+ MB


#The indexes of the rows with transmission na
transNA = df[df['TRANSMISSION'].isna()].index
#Drop the rows
df = df.drop(transNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2635664 entries, 0 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 324.2+ MB


#The indexes of the rows with fuel eco city na
cityNA = df[df['CITY FUEL ECONOMY'].isna()].index
#Drop the rows
df = df.drop(cityNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2357387 entries, 12 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 290.0+ MB


#The indexes of the rows with fuel eco highway na
highwayNA = df[df['HIGHWAY FUEL ECONOMY'].isna()].index
#Drop the rows
df = df.drop(highwayNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2357387 entries, 12 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 354.5+ MB


#The indexes of the rows with UNKNOWN color
colorNA = df[df['LISTING COLOR'] == "UNKNOWN"].index
#Drop the rows
df = df.drop(colorNA)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2033481 entries, 12 to 3000039
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MAKE NAME             object 
 1   MODEL NAME            object 
 2   YEAR                  int64  
 3   BODY TYPE             object 
 4   LISTING COLOR         object 
 5   MILEAGE               float64
 6   IS NEW                bool   
 7   OWNER COUNT           float64
 8   HAS ACCIDENTS         object 
 9   PRICE                 float64
 10  MAXIMUM SEATING       object 
 11  HORSEPOWER            float64
 12  ENGINE TYPE           object 
 13  CITY FUEL ECONOMY     float64
 14  HIGHWAY FUEL ECONOMY  float64
 15  TRANSMISSION          object 
dtypes: bool(1), float64(6), int64(1), object(8)
memory usage: 250.2+ MB


from sklearn.preprocessing import OrdinalEncoder

order_encoder_body=OrdinalEncoder()

df['BODY TYPE ENCODED'] = order_encoder_body.fit_transform(df['BODY TYPE'].to_numpy().reshape(-1, 1))

order_encoder_engine=OrdinalEncoder()

df['ENGINE TYPE ENCODED'] = order_encoder_engine.fit_transform(df['ENGINE TYPE'].to_numpy().reshape(-1, 1))

df.head(10)


df = df.replace(regex=[' seats'], value='')
df['MAXIMUM SEATING'] = pd.to_numeric(df['MAXIMUM SEATING'],errors = 'coerce')


#The indexes of the rows with seating as na
seatingNa = df[df['MAXIMUM SEATING'].isna()].index
#Drop the rows
df = df.drop(seatingNa)


from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

df_cluster = df[['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']]

pca_model = PCA()
df_cluster_pca = pca_model.fit_transform(df_cluster)

pca_df_cluster = pd.DataFrame(df_cluster_pca, columns=['PC1','PC2', 'PC3', 'PC3', 'PC4'])
df = pd.merge(df, pca_df_cluster, right_index=True, left_index=True)
df.head()


print('Explained Variance Ratio')
for i in range(5):
    print('PC{}: {}'.format(i+1,pca_model.explained_variance_ratio_[i]))

Explained Variance Ratio
PC1: 0.9915175225698315
PC2: 0.005320779761254336
PC3: 0.002606617911790896
PC4: 0.00042965830278262663
PC5: 0.00012542145434066125


pca = PCA()
pca.fit(df_cluster)

cumsum = np.cumsum(pca.explained_variance_ratio_)
dim = np.arange(len(cumsum))+1

plt.plot(dim,cumsum,'-',lw=3);

plt.xlabel('Dimensions')
plt.ylabel('Variance Explained')
plt.title('Selecting the right number of dimensions')

plt.xticks([1,2,3,4,5])
plt.ylim([0,1.1]);


variance_explained = .95

pca = PCA(n_components = variance_explained)

pca_data = pca.fit_transform(df_cluster)
print("{} Principal Components are required to explain {} of the variation in this data.".format(pca.n_components_, variance_explained))

1 Principal Components are required to explain 0.95 of the variation in this data.


sns.stripplot(x="PC1", y="BODY TYPE", data=df)
plt.title('Used Cars Visualized in One Dimension')

Text(0.5, 1.0, 'Used Cars Visualized in One Dimension')


df_mask = df['MODEL NAME']  == 'F-150'
df_150 = df[df_mask]
df_150_PC = df_150[['PC1']]

df_all_PC = df[['PC1']]
df_150.head()


from sklearn.cluster import KMeans
#nine different body types
K = 9

km = KMeans(n_clusters = K)
km.fit(df_all_PC)

predictions = km.predict(df_150_PC)
print(predictions)

[2 7 3 ... 7 2 2]


feature_set = ['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']
features = df[feature_set] 
target = df['BODY TYPE ENCODED']
 
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)


from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import fowlkes_mallows_score

num_clusters = 9
 
km = KMeans(n_clusters=num_clusters)
km.fit(feature_train)
 
predictions=km.predict(feature_test)
 
print("Davies-Bouldin Index:", davies_bouldin_score(feature_test, predictions))
print("Silhouette Coefficient:", silhouette_score(feature_test, predictions))
print("Adjusted Rand Score:", adjusted_rand_score(target_test, predictions))
print("Fowlkes Mallows Score:", fowlkes_mallows_score(target_test, predictions))

Davies-Bouldin Index: 0.6381341489188128
Silhouette Coefficient: 0.4984742399332719
Adjusted Rand Score: 1.2671855627134967
Fowlkes Mallows Score: 0.290885003158871


features = df[['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']]
target = df['BODY TYPE ENCODED']

from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

clf = clf.fit(feature_train,target_train)

predictions = clf.predict(feature_test)
print(predictions)

[3. 5. 5. ... 5. 5. 5.]


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

print("Accuracy:",accuracy_score(target_test, predictions))
print("Precision:",precision_score(target_test, predictions, average = 'macro'))
print("Recall:",recall_score(target_test, predictions, average = 'macro'))
print("F1-Score:",f1_score(target_test, predictions, average = 'macro'))

Accuracy: 0.9787284341348567
Precision: 0.9499259306379962
Recall: 0.9095862490780893
F1-Score: 0.9280160189269451


#The used cars
used = df[df['IS NEW'] == False]

#The used cars under 50000
used = used[used['PRICE'] <= 50000]

#Bin labels
bin_labels = [1, 2, 3, 4, 5]
bins = [0,10000,20000,30000,40000,50000]

#Using pd cut to bin the price data
used['PRICE RANGE'] = pd.cut(used['PRICE'], bins = bins, labels=bin_labels)

# Instantiate the Label Encoder Object
label_encoder = LabelEncoder()
# Fit and transform the column
used['MAKE NAME ENCODED'] = label_encoder.fit_transform(used['MAKE NAME'])
used['MODEL NAME ENCODED'] = label_encoder.fit_transform(used['MODEL NAME'])

#Filter the desired columns
used = used[['MAKE NAME','MAKE NAME ENCODED','MODEL NAME','MODEL NAME ENCODED','YEAR','MILEAGE', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY','PRICE','PRICE RANGE']]


#Split the dataset in two parts: feature set and target label 
feature_set = used[['MAKE NAME ENCODED','MODEL NAME ENCODED','YEAR','MILEAGE', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY']]
features = feature_set
target = used['PRICE RANGE']
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)


#Specify the number of clusters
num_clusters = 5
 
#Create and fit the KMeans model
km = KMeans(n_clusters=num_clusters)
km.fit(feature_train)
 
#Predict the target variable
predictions=km.predict(feature_test)

#Calculate internal performance evaluation measures
print("Davies-Bouldin Index:", davies_bouldin_score(feature_test, predictions))
print("Silhouette Coefficient:", silhouette_score(feature_test, predictions))

Davies-Bouldin Index: 0.5661549672831206
Silhouette Coefficient: 0.5327719692166731


#Create a Decision Tree classifier object
clf = DecisionTreeClassifier()
 
#Train the model using training dataset
clf = clf.fit(feature_train,target_train)
 
# Predict the response for test dataset
predictions = clf.predict(feature_test)
 
#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))

Accuracy: 0.7776631357936805
Precision: 0.7776631357936805
Recall: 0.7776631357936805
F1-Score: 0.7776631357936805


#Create a KNN classifier object
model = KNeighborsClassifier(n_neighbors=5)
 
#Train the model using the training dataset
model.fit(feature_train,target_train)
 
#Predict the target variable for test dataset
predictions = model.predict(feature_test)

#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))

Accuracy: 0.5766331601688691
Precision: 0.5766331601688691
Recall: 0.5766331601688691
F1-Score: 0.5766331601688691


#Initialize the StandardScaler 
scaler = StandardScaler() 
  
#Scale the mileage data
scaler.fit(used['MILEAGE'].values.reshape(-1,1))  
used['MILEAGE STANDARDIZED'] = scaler.transform(used['MILEAGE'].values.reshape(-1,1))

scaler.fit(used['CITY FUEL ECONOMY'].values.reshape(-1,1))  
used['CITY FUEL ECONOMY STANDARDIZED'] = scaler.transform(used['CITY FUEL ECONOMY'].values.reshape(-1,1))

scaler.fit(used['HIGHWAY FUEL ECONOMY'].values.reshape(-1,1))  
used['HIGHWAY FUEL ECONOMY STANDARDIZED'] = scaler.transform(used['HIGHWAY FUEL ECONOMY'].values.reshape(-1,1))

#Split the dataset in two parts: feature set and target label 
feature_set = used[['MAKE NAME ENCODED','MODEL NAME ENCODED','YEAR','MILEAGE STANDARDIZED', 'CITY FUEL ECONOMY STANDARDIZED', 'HIGHWAY FUEL ECONOMY STANDARDIZED']]
features = feature_set
target = used['PRICE RANGE']
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)


#Create a KNN classifier object
model = KNeighborsClassifier(n_neighbors=5)
 
#Train the model using the training dataset
model.fit(feature_train,target_train)
 
#Predict the target variable for test dataset
predictions = model.predict(feature_test)

#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))

Accuracy: 0.8263639517542476
Precision: 0.8263639517542476
Recall: 0.8263639517542476
F1-Score: 0.8263639517542476


dfMakePrice = df.groupby('MAKE NAME').mean()['PRICE']
dfMakePrice = dfMakePrice.sort_values(ascending = False)
dfMakePriceTop10 = dfMakePrice.head(10)
dfMakePriceTop10 = dfMakePriceTop10.reset_index()
dfMakePriceTop10 = dfMakePriceTop10.head(10)

import plotly.express as px

fig = px.bar(dfMakePriceTop10, x='MAKE NAME', y='PRICE',                  
             labels={
                     "PRICE": "Average Price",
                     "MAKE NAME": "Make"
                 },
                title="Top 10 Average Prices by Make")
fig.show()


df_damaged = df[(df['HAS ACCIDENTS'] == True)]

df_not = df[(df['HAS ACCIDENTS'] == False)]

plt.figure(figsize=(15, 10))

plt.scatter(x = df_not['PRICE'] , y = df_not['MILEAGE'], s=1, color='blue')
plt.scatter(x = df_damaged['PRICE'] , y = df_damaged['MILEAGE'], s=2, color='red')

plt.xlabel("PRICE")
plt.ylabel("MILEAGE")
plt.xlim((0, 500000))
plt.legend(['Not Damaged', 'Damaged'], loc='upper right')

plt.show()


df['LISTING COLOR'].value_counts()

BLACK     326776
WHITE     318261
SILVER    209393
GRAY      201712
BLUE      141911
RED       140168
GREEN      13208
BROWN      12099
ORANGE      6407
GOLD        5568
TEAL        3175
YELLOW      2260
PURPLE       836
PINK          39
Name: LISTING COLOR, dtype: int64


plt.figure(figsize=(15, 5))
ax = sns.boxplot(x = 'LISTING COLOR', y = 'PRICE', data=df)
ax.set_title('PRICE by COLOR')

Text(0.5, 1.0, 'PRICE by COLOR')


# Drop the outlier observations using Percentiles
upper_limit = df['PRICE'].quantile(.999)
dfDrop = df[(df['PRICE'] < upper_limit)]


plt.figure(figsize=(15, 5))
ax = sns.boxplot(x = 'LISTING COLOR', y = 'PRICE', data=dfDrop)
ax.set_title('PRICE by COLOR')

Text(0.5, 1.0, 'PRICE by COLOR')


make_count = df['MAKE NAME'].value_counts()
unique_makes = df['MAKE NAME'].unique()
make_count = make_count.to_frame()
make_count = make_count.reset_index()
make_count = make_count.rename({'index': 'MAKE', 'MAKE NAME': 'COUNT'}, axis='columns')
make_count_top24 = make_count.head(24)
make_count_rest = make_count.iloc[24:]
other_total = make_count_rest['COUNT'].sum()
other = {'MAKE': 'Other', 'COUNT': other_total}
make_count_top25 = make_count_top24.append(other, ignore_index = True)

plt.figure(figsize=(15, 15))

plt.pie(make_count_top25['COUNT'], labels = make_count_top25['MAKE'],
        shadow= True,
        autopct='%.1f%%')
plt.title('Percentage of Different Car Makes')
plt.axis('equal')
plt.show()


plt.figure(figsize=(20, 10))
ax = sns.violinplot(x = 'HAS ACCIDENTS', y = 'PRICE', data=dfDrop);
ax.set_title('Comparing price vs accident status')
ax.set_ylim((-10000, 150000))

(-10000.0, 150000.0)


plt.figure(figsize=(20, 10))
ax = sns.violinplot(x = 'HAS ACCIDENTS', y = 'MILEAGE', data=dfDrop);
ax.set_title('Comparing mileage vs accident status')

Text(0.5, 1.0, 'Comparing mileage vs accident status')


dfTop500Mile = df.sort_values(by=["MILEAGE"], ascending = False).head(500)
count = dfTop500Mile['MAKE NAME'].value_counts().head(10)
unq = count.index.unique()
plt.figure(figsize=(12, 6))
ax = sns.swarmplot(x = 'MAKE NAME', y = 'MILEAGE', data=dfTop500Mile, hue='HAS ACCIDENTS', order = unq)
plt.legend(loc='upper right', title='HAS ACCIDENTS')
plt.title('Top 10 Makes of the top 500 cars with the most mileage')

Text(0.5, 1.0, 'Top 10 Makes of the top 500 cars with the most mileage')


with sns.axes_style('white'):
    plot = sns.catplot(x = 'YEAR', data = df, aspect=1.5, kind = 'count',
                   hue = 'TRANSMISSION', height = 10)
    plot.set_ylabels('Number of cars')
plt.show()


# Obviously skewed data
with sns.axes_style('white'):
    plot = sns.jointplot(x = 'PRICE', y = 'MILEAGE', data = dfDrop, kind = 'hex', height = 10, xlim = (-2500, 200000), ylim = (-5000, 220000))


dfDrop['MILEAGE FEATURE TRANSFORMATION'] = np.cbrt(dfDrop['MILEAGE'])

dfDrop['PRICE FEATURE TRANSFORMATION'] = np.sqrt(dfDrop['PRICE'])

with sns.axes_style('white'):
    plot = sns.jointplot(x = 'PRICE FEATURE TRANSFORMATION', y = 'MILEAGE FEATURE TRANSFORMATION', data = dfDrop, kind = 'hex', height = 10, xlim = (0, 300))
plot.set_axis_labels(ylabel = 'Transformed Mileage', xlabel = 'Transformed Price')

<seaborn.axisgrid.JointGrid at 0x272d1b2e580>


ppdf = df[['HORSEPOWER', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MILEAGE', 'HAS ACCIDENTS']]
sns.pairplot(ppdf, hue='HAS ACCIDENTS', height=2.5);


#Set fig Size
plt.figure(figsize=(20, 8))
#Slant Labels
plt.xticks(rotation=45)
#Plot
sns.scatterplot(data=df, x="MAKE NAME", y="ENGINE TYPE",hue='TRANSMISSION')

<AxesSubplot:xlabel='MAKE NAME', ylabel='ENGINE TYPE'>


#The used cars
pair = df[df['IS NEW'] == False]
#Used cars under 50,000
pair = pair[pair['PRICE'] <= 50000]

#Using pd qcut to bin the price data this cuts the data into equal bins by using iq ranges
pair['PRICE RANGE'] = pd.qcut(pair['PRICE'],q=5)

#Filter the desired columns
pair = pair[['YEAR','MILEAGE','PRICE RANGE']]
sns.pairplot(pair,hue='PRICE RANGE', height=5.5)

<seaborn.axisgrid.PairGrid at 0x273a83ca640>

	vin	back_legroom	bed	bed_height	bed_length	body_type	cabin	city	city_fuel_economy	combine_fuel_economy	daysonmarket	dealer_zip	description	engine_cylinders	engine_displacement	engine_type	exterior_color	fleet	frame_damaged	franchise_dealer	franchise_make	front_legroom	fuel_tank_volume	fuel_type	has_accidents	height	highway_fuel_economy	horsepower	interior_color	isCab	is_certified	is_cpo	is_new	is_oemcpo	latitude	length	listed_date	listing_color	listing_id	longitude	main_picture_url	major_options	make_name	maximum_seating	mileage	model_name	owner_count	power	price	salvage	seller_rating	sp_id	sp_name	theft_title	torque	transmission	transmission_display	trimId	trim_name	vehicle_damage_category	wheel_system	wheel_system_display	wheelbase	width	year
0	ZACNJABB5KPJ92081	35.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	Bayamon	NaN	NaN	522	960	[!@@Additional Info@@!]Engine: 2.4L I4 ZERO EV...	I4	1300.0	I4	Solar Yellow	NaN	NaN	True	Jeep	41.2 in	12.7 gal	Gasoline	NaN	66.5 in	NaN	177.0	Black	NaN	NaN	NaN	True	NaN	18.3988	166.6 in	2019-04-06	YELLOW	237132766	-66.1582	https://static.cargurus.com/images/forsale/202...	['Quick Order Package']	Jeep	5 seats	7.0	Renegade	NaN	177 hp @ 5,750 RPM	23141.0	NaN	2.8	370599.0	Flagship Chrysler	NaN	200 lb-ft @ 1,750 RPM	A	9-Speed Automatic Overdrive	t83804	Latitude FWD	NaN	FWD	Front-Wheel Drive	101.2 in	79.6 in	2019
1	SALCJ2FX1LH858117	38.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	207	922	[!@@Additional Info@@!]Keyless Entry,Ebony Mor...	I4	2000.0	I4	Narvik Black	NaN	NaN	True	Land Rover	39.1 in	17.7 gal	Gasoline	NaN	68 in	NaN	246.0	Black (Ebony)	NaN	NaN	NaN	True	NaN	18.4439	181 in	2020-02-15	BLACK	265946296	-66.0785	https://static.cargurus.com/images/forsale/202...	['Adaptive Cruise Control']	Land Rover	7 seats	8.0	Discovery Sport	NaN	246 hp @ 5,500 RPM	46500.0	NaN	3.0	389227.0	Land Rover San Juan	NaN	269 lb-ft @ 1,400 RPM	A	9-Speed Automatic Overdrive	t86759	S AWD	NaN	AWD	All-Wheel Drive	107.9 in	85.6 in	2020
2	JF1VA2M67G9829723	35.4 in	NaN	NaN	NaN	Sedan	NaN	Guaynabo	17.0	NaN	1233	969	NaN	H4	2500.0	H4	None	False	False	True	FIAT	43.3 in	15.9 gal	Gasoline	False	58.1 in	23.0	305.0	None	False	NaN	NaN	False	NaN	18.3467	180.9 in	2017-04-25	UNKNOWN	173473508	-66.1098	NaN	['Alloy Wheels', 'Bluetooth', 'Backup Camera',...	Subaru	5 seats	NaN	WRX STI	3.0	305 hp @ 6,000 RPM	46995.0	False	NaN	370467.0	FIAT de San Juan	False	290 lb-ft @ 4,000 RPM	M	6-Speed Manual	t58994	Base	NaN	AWD	All-Wheel Drive	104.3 in	78.9 in	2016
3	SALRR2RV0L2433391	37.6 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	196	922	[!@@Additional Info@@!]Fog Lights,7 Seat Packa...	V6	3000.0	V6	Eiger Gray	NaN	NaN	True	Land Rover	39 in	23.5 gal	Gasoline	NaN	73 in	NaN	340.0	Gray (Ebony/Ebony/Ebony)	NaN	NaN	NaN	True	NaN	18.4439	195.1 in	2020-02-26	GRAY	266911050	-66.0785	https://static.cargurus.com/images/forsale/202...	NaN	Land Rover	7 seats	11.0	Discovery	NaN	340 hp @ 6,500 RPM	67430.0	NaN	3.0	389227.0	Land Rover San Juan	NaN	332 lb-ft @ 3,500 RPM	A	8-Speed Automatic Overdrive	t86074	V6 HSE AWD	NaN	AWD	All-Wheel Drive	115 in	87.4 in	2020
4	SALCJ2FXXLH862327	38.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	137	922	[!@@Additional Info@@!]Keyless Entry,Ebony Mor...	I4	2000.0	I4	Narvik Black	NaN	NaN	True	Land Rover	39.1 in	17.7 gal	Gasoline	NaN	68 in	NaN	246.0	Black (Ebony)	NaN	NaN	NaN	True	NaN	18.4439	181 in	2020-04-25	BLACK	270957414	-66.0785	https://static.cargurus.com/images/forsale/202...	['Adaptive Cruise Control']	Land Rover	7 seats	7.0	Discovery Sport	NaN	246 hp @ 5,500 RPM	48880.0	NaN	3.0	389227.0	Land Rover San Juan	NaN	269 lb-ft @ 1,400 RPM	A	9-Speed Automatic Overdrive	t86759	S AWD	NaN	AWD	All-Wheel Drive	107.9 in	85.6 in	2020
5	SALYK2EX1LA261711	37.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	242	922	[!@@Additional Info@@!]Tires: 21' All-Season,A...	I4	2000.0	I4	Kaikoura Stone	False	False	True	Land Rover	40.2 in	16.6 gal	Gasoline	False	66.3 in	NaN	247.0	Brown (Ebony / Ebony)	False	NaN	NaN	True	NaN	18.4439	188.9 in	2020-01-11	UNKNOWN	262940541	-66.0785	https://static.cargurus.com/images/forsale/202...	['Leather Seats', 'Sunroof/Moonroof', 'Navigat...	Land Rover	5 seats	12.0	Range Rover Velar	NaN	247 hp @ 5,500 RPM	66903.0	False	3.0	389227.0	Land Rover San Juan	False	269 lb-ft @ 1,200 RPM	A	8-Speed Automatic Overdrive	t85614	P250 R-Dynamic S AWD	NaN	AWD	All-Wheel Drive	113.1 in	84.4 in	2020
6	3MZBPABL6KM107908	35.1 in	NaN	NaN	NaN	Sedan	NaN	Bayamon	NaN	NaN	447	960	[!@@Additional Info@@!]4-Wheel Disc Brakes,A/C...	I4	2500.0	I4	SONIC SILVER	NaN	NaN	True	Jeep	42.3 in	13.2 gal	Gasoline	NaN	56.9 in	NaN	186.0	Black	NaN	NaN	NaN	True	NaN	18.3988	183.5 in	2019-06-20	SILVER	244110426	-66.1582	https://static.cargurus.com/images/forsale/201...	['Alloy Wheels', 'Bluetooth', 'Backup Camera']	Mazda	5 seats	14.0	MAZDA3	NaN	186 hp @ 6,000 RPM	23695.0	NaN	2.8	370599.0	Flagship Chrysler	NaN	186 lb-ft @ 4,000 RPM	A	6-Speed Automatic Overdrive	t85256	Sedan FWD	NaN	FWD	Front-Wheel Drive	107.3 in	70.7 in	2019
7	SALYK2EX5LA275434	37.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	70	922	[!@@Additional Info@@!]Tires: 21' All-Season,A...	I4	2000.0	I4	Fuji White	NaN	NaN	True	Land Rover	40.2 in	16.6 gal	Gasoline	NaN	66.3 in	NaN	247.0	White (Eclipse / Ebony / Ebony)	NaN	NaN	NaN	True	NaN	18.4439	188.9 in	2020-07-01	WHITE	275458784	-66.0785	https://static.cargurus.com/images/forsale/202...	['Adaptive Cruise Control']	Land Rover	5 seats	11.0	Range Rover Velar	NaN	247 hp @ 5,500 RPM	68520.0	NaN	3.0	389227.0	Land Rover San Juan	NaN	269 lb-ft @ 1,200 RPM	A	8-Speed Automatic Overdrive	t85614	P250 R-Dynamic S AWD	NaN	AWD	All-Wheel Drive	113.1 in	84.4 in	2020
8	SALCJ2FX6LH858128	38.1 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	196	922	[!@@Additional Info@@!]Keyless Entry,Chrome Wh...	I4	2000.0	I4	Eiger Gray	NaN	NaN	True	Land Rover	39.1 in	17.7 gal	Gasoline	NaN	68 in	NaN	246.0	Black (Ebony)	NaN	NaN	NaN	True	NaN	18.4439	181 in	2020-02-26	GRAY	266911040	-66.0785	https://static.cargurus.com/images/forsale/202...	['Navigation System', 'Backup Camera', 'Blind ...	Land Rover	7 seats	8.0	Discovery Sport	NaN	246 hp @ 5,500 RPM	51245.0	NaN	3.0	389227.0	Land Rover San Juan	NaN	269 lb-ft @ 1,400 RPM	A	9-Speed Automatic Overdrive	t86759	S AWD	NaN	AWD	All-Wheel Drive	107.9 in	85.6 in	2020
9	SALZL2GX4LH007593	33.8 in	NaN	NaN	NaN	SUV / Crossover	NaN	San Juan	NaN	NaN	510	922	[!@@Additional Info@@!]4-Wheel Disc Brakes,A/C...	I4	2000.0	I4	Blanco	False	False	True	Land Rover	40 in	17.7 gal	Gasoline	False	64.9 in	NaN	296.0	Eclipse/Ebony	False	NaN	NaN	False	NaN	18.4439	172.1 in	2019-04-18	WHITE	238225156	-66.0785	https://static.cargurus.com/images/forsale/202...	['Leather Seats', 'Navigation System', 'Adapti...	Land Rover	5 seats	254.0	Range Rover Evoque	NaN	296 hp @ 5,500 RPM	84399.0	False	3.0	389227.0	Land Rover San Juan	False	295 lb-ft @ 1,600 RPM	A	9-Speed Automatic Overdrive	t85531	P300 R-Dynamic SE AWD	NaN	AWD	All-Wheel Drive	105.6 in	82.7 in	2020

	MAKE NAME	MODEL NAME	YEAR	BODY TYPE	LISTING COLOR	MILEAGE	IS NEW	OWNER COUNT	HAS ACCIDENTS	PRICE	MAXIMUM SEATING	HORSEPOWER	ENGINE TYPE	CITY FUEL ECONOMY	HIGHWAY FUEL ECONOMY	TRANSMISSION
0	Jeep	Renegade	2019	SUV / Crossover	YELLOW	7.0	True	NaN	NaN	23141.0	5 seats	177.0	I4	NaN	NaN	A
1	Land Rover	Discovery Sport	2020	SUV / Crossover	BLACK	8.0	True	NaN	NaN	46500.0	7 seats	246.0	I4	NaN	NaN	A
2	Subaru	WRX STI	2016	Sedan	UNKNOWN	NaN	False	3.0	False	46995.0	5 seats	305.0	H4	17.0	23.0	M
3	Land Rover	Discovery	2020	SUV / Crossover	GRAY	11.0	True	NaN	NaN	67430.0	7 seats	340.0	V6	NaN	NaN	A
4	Land Rover	Discovery Sport	2020	SUV / Crossover	BLACK	7.0	True	NaN	NaN	48880.0	7 seats	246.0	I4	NaN	NaN	A
5	Land Rover	Range Rover Velar	2020	SUV / Crossover	UNKNOWN	12.0	True	NaN	False	66903.0	5 seats	247.0	I4	NaN	NaN	A
6	Mazda	MAZDA3	2019	Sedan	SILVER	14.0	True	NaN	NaN	23695.0	5 seats	186.0	I4	NaN	NaN	A
7	Land Rover	Range Rover Velar	2020	SUV / Crossover	WHITE	11.0	True	NaN	NaN	68520.0	5 seats	247.0	I4	NaN	NaN	A
8	Land Rover	Discovery Sport	2020	SUV / Crossover	GRAY	8.0	True	NaN	NaN	51245.0	7 seats	246.0	I4	NaN	NaN	A
9	Land Rover	Range Rover Evoque	2020	SUV / Crossover	WHITE	254.0	False	NaN	False	84399.0	5 seats	296.0	I4	NaN	NaN	A

	MAKE NAME	MODEL NAME	YEAR	BODY TYPE	LISTING COLOR	MILEAGE	IS NEW	OWNER COUNT	HAS ACCIDENTS	PRICE	MAXIMUM SEATING	HORSEPOWER	ENGINE TYPE	CITY FUEL ECONOMY	HIGHWAY FUEL ECONOMY	TRANSMISSION	BODY TYPE ENCODED	ENGINE TYPE ENCODED
12	BMW	3 Series	2016	Sedan	SILVER	6903.0	False	2.0	False	58995.0	5 seats	320.0	I6	22.0	33.0	A	6.0	13.0
38	Chevrolet	Malibu	2018	Sedan	SILVER	42394.0	False	1.0	False	14639.0	5 seats	160.0	I4	27.0	36.0	A	6.0	6.0
40	Lexus	RC 350	2018	Coupe	BLACK	62251.0	False	1.0	False	32000.0	4 seats	311.0	V6	18.0	24.0	A	1.0	20.0
41	Chevrolet	Traverse	2018	SUV / Crossover	SILVER	36410.0	False	1.0	False	23723.0	8 seats	310.0	V6	18.0	27.0	A	5.0	20.0
45	Chevrolet	Traverse	2017	SUV / Crossover	SILVER	36055.0	False	1.0	True	22422.0	8 seats	281.0	V6	15.0	22.0	A	5.0	20.0
47	Jeep	Grand Cherokee	2018	SUV / Crossover	BLACK	25745.0	False	1.0	False	29424.0	5 seats	295.0	V6	18.0	25.0	A	5.0	20.0
49	Jeep	Compass	2017	SUV / Crossover	BLACK	14607.0	False	1.0	False	17000.0	5 seats	172.0	I4	20.0	25.0	A	5.0	6.0
51	Hyundai	Veloster	2020	Coupe	RED	3073.0	False	1.0	False	16900.0	4 seats	147.0	I4	27.0	34.0	A	1.0	6.0
53	Jeep	Grand Cherokee	2017	SUV / Crossover	SILVER	16467.0	False	1.0	False	25500.0	5 seats	241.0	V6	18.0	25.0	A	5.0	20.0
54	Chevrolet	Traverse	2018	SUV / Crossover	WHITE	37536.0	False	1.0	False	23939.0	8 seats	310.0	V6	18.0	27.0	A	5.0	20.0

	MAKE NAME	MODEL NAME	YEAR	BODY TYPE	LISTING COLOR	MILEAGE	IS NEW	OWNER COUNT	HAS ACCIDENTS	PRICE	MAXIMUM SEATING	HORSEPOWER	ENGINE TYPE	CITY FUEL ECONOMY	HIGHWAY FUEL ECONOMY	TRANSMISSION	BODY TYPE ENCODED	ENGINE TYPE ENCODED	PC1	PC2	PC3	PC3	PC4
12	BMW	3 Series	2016	Sedan	SILVER	6903.0	False	2.0	False	58995.0	5.0	320.0	I6	22.0	33.0	A	6.0	13.0	-0.065989	-8.400893	-5.117548	-0.336206	-0.941181
38	Chevrolet	Malibu	2018	Sedan	SILVER	42394.0	False	1.0	False	14639.0	5.0	160.0	I4	27.0	36.0	A	6.0	6.0	178.779645	1.507774	-0.775535	0.440298	1.815928
40	Lexus	RC 350	2018	Coupe	BLACK	62251.0	False	1.0	False	32000.0	4.0	311.0	V6	18.0	24.0	A	1.0	20.0	40.029696	-8.739841	-1.883397	0.018708	1.997441
41	Chevrolet	Traverse	2018	SUV / Crossover	SILVER	36410.0	False	1.0	False	23723.0	8.0	310.0	V6	18.0	27.0	A	5.0	20.0	5.118766	6.317278	4.963830	-1.261450	-1.000357
45	Chevrolet	Traverse	2017	SUV / Crossover	SILVER	36055.0	False	1.0	True	22422.0	8.0	281.0	V6	15.0	22.0	A	5.0	20.0	-69.907054	-8.019258	3.903574	1.197020	-0.350823

	MAKE NAME	MODEL NAME	YEAR	BODY TYPE	LISTING COLOR	MILEAGE	IS NEW	OWNER COUNT	HAS ACCIDENTS	PRICE	MAXIMUM SEATING	HORSEPOWER	ENGINE TYPE	CITY FUEL ECONOMY	HIGHWAY FUEL ECONOMY	TRANSMISSION	BODY TYPE ENCODED	ENGINE TYPE ENCODED	PC1	PC2	PC3	PC3	PC4
381	Ford	F-150	2001	Pickup Truck	GRAY	173122.0	False	2.0	False	3499.0	6.0	260.0	V8	12.0	16.0	A	4.0	26.0	-136.500942	-1.618505	-3.922177	1.227352	-0.148023
477	Ford	F-150	2016	Pickup Truck	BLACK	32168.0	False	2.0	False	22118.0	3.0	282.0	V6 Flex Fuel Vehicle	17.0	23.0	A	4.0	24.0	-75.814952	-9.296057	3.734391	2.014478	1.590737
496	Ford	F-150	2015	Pickup Truck	BLACK	66211.0	False	2.0	False	35965.0	5.0	365.0	V6	17.0	23.0	A	4.0	20.0	40.029696	-8.739841	-1.883397	0.018708	1.997441
551	Ford	F-150	2018	Pickup Truck	WHITE	32299.0	False	1.0	False	22685.0	3.0	290.0	V6 Flex Fuel Vehicle	20.0	25.0	A	4.0	24.0	67.234743	2.306615	17.625948	2.654663	-0.075816
1339	Ford	F-150	2016	Pickup Truck	RED	29441.0	False	1.0	False	42742.0	6.0	365.0	V6	16.0	22.0	A	4.0	20.0	203.477592	5.882433	-0.056673	-1.879045	-1.008545

Clustering Vs Classification: Predicting the body type from City Fuel Economy, Highway Fuel Economy, Engine Type, Maximum seating, and Horsepower¶

A brief overview of the results¶

Results of classification¶

Clustering Vs Classification: Predicting the Price Range of a Used Car by Make, Model, Year, Mileage, Fuel Economy (CJN)¶

K-Means¶

Decision Tree¶

K-Neighbors¶

All Cars: Visualizing price by Make, Mileage, and Color¶

Problem!¶

Much Better!¶

Pie Chart of top 25 Makes¶

Violin plot of Price and Mileage vs Accidents¶

Top 500 cars by Make and Accident Status¶

Count of transmission type and year¶

Dealing with skewed Data¶

Pair Plot to examine confounding variables when interpreting price¶

All Cars: Scatterplot Comparing Engine Type and Make Name with the Hue representing Transmission Types (CJN)¶

Used Cars under 50,000 Dollars: Pair Plot Comparing Year, Mileage with the Hue representing different Price Ranges (CJN)¶