#Import the needed base libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
#Import encoders and scalers
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
#Import data set splitter
from sklearn.model_selection import train_test_split
# Import metrics module for performance evaluation
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import fowlkes_mallows_score
# Import metrics module for performance evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Import Clustering models
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
# Import Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
#Set the panda dataframes to display all columns
pd.set_option('display.max_columns', None)
#Set the visual theme for matplot and seaborn plots
sns.set_theme()
df = pd.read_csv("used_cars_data.csv")
df.head(10)
vin | back_legroom | bed | bed_height | bed_length | body_type | cabin | city | city_fuel_economy | combine_fuel_economy | daysonmarket | dealer_zip | description | engine_cylinders | engine_displacement | engine_type | exterior_color | fleet | frame_damaged | franchise_dealer | franchise_make | front_legroom | fuel_tank_volume | fuel_type | has_accidents | height | highway_fuel_economy | horsepower | interior_color | isCab | is_certified | is_cpo | is_new | is_oemcpo | latitude | length | listed_date | listing_color | listing_id | longitude | main_picture_url | major_options | make_name | maximum_seating | mileage | model_name | owner_count | power | price | salvage | savings_amount | seller_rating | sp_id | sp_name | theft_title | torque | transmission | transmission_display | trimId | trim_name | vehicle_damage_category | wheel_system | wheel_system_display | wheelbase | width | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ZACNJABB5KPJ92081 | 35.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | Bayamon | NaN | NaN | 522 | 960 | [!@@Additional Info@@!]Engine: 2.4L I4 ZERO EV... | I4 | 1300.0 | I4 | Solar Yellow | NaN | NaN | True | Jeep | 41.2 in | 12.7 gal | Gasoline | NaN | 66.5 in | NaN | 177.0 | Black | NaN | NaN | NaN | True | NaN | 18.3988 | 166.6 in | 2019-04-06 | YELLOW | 237132766 | -66.1582 | https://static.cargurus.com/images/forsale/202... | ['Quick Order Package'] | Jeep | 5 seats | 7.0 | Renegade | NaN | 177 hp @ 5,750 RPM | 23141.0 | NaN | 0 | 2.8 | 370599.0 | Flagship Chrysler | NaN | 200 lb-ft @ 1,750 RPM | A | 9-Speed Automatic Overdrive | t83804 | Latitude FWD | NaN | FWD | Front-Wheel Drive | 101.2 in | 79.6 in | 2019 |
1 | SALCJ2FX1LH858117 | 38.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 207 | 922 | [!@@Additional Info@@!]Keyless Entry,Ebony Mor... | I4 | 2000.0 | I4 | Narvik Black | NaN | NaN | True | Land Rover | 39.1 in | 17.7 gal | Gasoline | NaN | 68 in | NaN | 246.0 | Black (Ebony) | NaN | NaN | NaN | True | NaN | 18.4439 | 181 in | 2020-02-15 | BLACK | 265946296 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Adaptive Cruise Control'] | Land Rover | 7 seats | 8.0 | Discovery Sport | NaN | 246 hp @ 5,500 RPM | 46500.0 | NaN | 0 | 3.0 | 389227.0 | Land Rover San Juan | NaN | 269 lb-ft @ 1,400 RPM | A | 9-Speed Automatic Overdrive | t86759 | S AWD | NaN | AWD | All-Wheel Drive | 107.9 in | 85.6 in | 2020 |
2 | JF1VA2M67G9829723 | 35.4 in | NaN | NaN | NaN | Sedan | NaN | Guaynabo | 17.0 | NaN | 1233 | 969 | NaN | H4 | 2500.0 | H4 | None | False | False | True | FIAT | 43.3 in | 15.9 gal | Gasoline | False | 58.1 in | 23.0 | 305.0 | None | False | NaN | NaN | False | NaN | 18.3467 | 180.9 in | 2017-04-25 | UNKNOWN | 173473508 | -66.1098 | NaN | ['Alloy Wheels', 'Bluetooth', 'Backup Camera',... | Subaru | 5 seats | NaN | WRX STI | 3.0 | 305 hp @ 6,000 RPM | 46995.0 | False | 0 | NaN | 370467.0 | FIAT de San Juan | False | 290 lb-ft @ 4,000 RPM | M | 6-Speed Manual | t58994 | Base | NaN | AWD | All-Wheel Drive | 104.3 in | 78.9 in | 2016 |
3 | SALRR2RV0L2433391 | 37.6 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 196 | 922 | [!@@Additional Info@@!]Fog Lights,7 Seat Packa... | V6 | 3000.0 | V6 | Eiger Gray | NaN | NaN | True | Land Rover | 39 in | 23.5 gal | Gasoline | NaN | 73 in | NaN | 340.0 | Gray (Ebony/Ebony/Ebony) | NaN | NaN | NaN | True | NaN | 18.4439 | 195.1 in | 2020-02-26 | GRAY | 266911050 | -66.0785 | https://static.cargurus.com/images/forsale/202... | NaN | Land Rover | 7 seats | 11.0 | Discovery | NaN | 340 hp @ 6,500 RPM | 67430.0 | NaN | 0 | 3.0 | 389227.0 | Land Rover San Juan | NaN | 332 lb-ft @ 3,500 RPM | A | 8-Speed Automatic Overdrive | t86074 | V6 HSE AWD | NaN | AWD | All-Wheel Drive | 115 in | 87.4 in | 2020 |
4 | SALCJ2FXXLH862327 | 38.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 137 | 922 | [!@@Additional Info@@!]Keyless Entry,Ebony Mor... | I4 | 2000.0 | I4 | Narvik Black | NaN | NaN | True | Land Rover | 39.1 in | 17.7 gal | Gasoline | NaN | 68 in | NaN | 246.0 | Black (Ebony) | NaN | NaN | NaN | True | NaN | 18.4439 | 181 in | 2020-04-25 | BLACK | 270957414 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Adaptive Cruise Control'] | Land Rover | 7 seats | 7.0 | Discovery Sport | NaN | 246 hp @ 5,500 RPM | 48880.0 | NaN | 0 | 3.0 | 389227.0 | Land Rover San Juan | NaN | 269 lb-ft @ 1,400 RPM | A | 9-Speed Automatic Overdrive | t86759 | S AWD | NaN | AWD | All-Wheel Drive | 107.9 in | 85.6 in | 2020 |
5 | SALYK2EX1LA261711 | 37.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 242 | 922 | [!@@Additional Info@@!]Tires: 21' All-Season,A... | I4 | 2000.0 | I4 | Kaikoura Stone | False | False | True | Land Rover | 40.2 in | 16.6 gal | Gasoline | False | 66.3 in | NaN | 247.0 | Brown (Ebony / Ebony) | False | NaN | NaN | True | NaN | 18.4439 | 188.9 in | 2020-01-11 | UNKNOWN | 262940541 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Leather Seats', 'Sunroof/Moonroof', 'Navigat... | Land Rover | 5 seats | 12.0 | Range Rover Velar | NaN | 247 hp @ 5,500 RPM | 66903.0 | False | 0 | 3.0 | 389227.0 | Land Rover San Juan | False | 269 lb-ft @ 1,200 RPM | A | 8-Speed Automatic Overdrive | t85614 | P250 R-Dynamic S AWD | NaN | AWD | All-Wheel Drive | 113.1 in | 84.4 in | 2020 |
6 | 3MZBPABL6KM107908 | 35.1 in | NaN | NaN | NaN | Sedan | NaN | Bayamon | NaN | NaN | 447 | 960 | [!@@Additional Info@@!]4-Wheel Disc Brakes,A/C... | I4 | 2500.0 | I4 | SONIC SILVER | NaN | NaN | True | Jeep | 42.3 in | 13.2 gal | Gasoline | NaN | 56.9 in | NaN | 186.0 | Black | NaN | NaN | NaN | True | NaN | 18.3988 | 183.5 in | 2019-06-20 | SILVER | 244110426 | -66.1582 | https://static.cargurus.com/images/forsale/201... | ['Alloy Wheels', 'Bluetooth', 'Backup Camera'] | Mazda | 5 seats | 14.0 | MAZDA3 | NaN | 186 hp @ 6,000 RPM | 23695.0 | NaN | 0 | 2.8 | 370599.0 | Flagship Chrysler | NaN | 186 lb-ft @ 4,000 RPM | A | 6-Speed Automatic Overdrive | t85256 | Sedan FWD | NaN | FWD | Front-Wheel Drive | 107.3 in | 70.7 in | 2019 |
7 | SALYK2EX5LA275434 | 37.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 70 | 922 | [!@@Additional Info@@!]Tires: 21' All-Season,A... | I4 | 2000.0 | I4 | Fuji White | NaN | NaN | True | Land Rover | 40.2 in | 16.6 gal | Gasoline | NaN | 66.3 in | NaN | 247.0 | White (Eclipse / Ebony / Ebony) | NaN | NaN | NaN | True | NaN | 18.4439 | 188.9 in | 2020-07-01 | WHITE | 275458784 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Adaptive Cruise Control'] | Land Rover | 5 seats | 11.0 | Range Rover Velar | NaN | 247 hp @ 5,500 RPM | 68520.0 | NaN | 0 | 3.0 | 389227.0 | Land Rover San Juan | NaN | 269 lb-ft @ 1,200 RPM | A | 8-Speed Automatic Overdrive | t85614 | P250 R-Dynamic S AWD | NaN | AWD | All-Wheel Drive | 113.1 in | 84.4 in | 2020 |
8 | SALCJ2FX6LH858128 | 38.1 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 196 | 922 | [!@@Additional Info@@!]Keyless Entry,Chrome Wh... | I4 | 2000.0 | I4 | Eiger Gray | NaN | NaN | True | Land Rover | 39.1 in | 17.7 gal | Gasoline | NaN | 68 in | NaN | 246.0 | Black (Ebony) | NaN | NaN | NaN | True | NaN | 18.4439 | 181 in | 2020-02-26 | GRAY | 266911040 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Navigation System', 'Backup Camera', 'Blind ... | Land Rover | 7 seats | 8.0 | Discovery Sport | NaN | 246 hp @ 5,500 RPM | 51245.0 | NaN | 0 | 3.0 | 389227.0 | Land Rover San Juan | NaN | 269 lb-ft @ 1,400 RPM | A | 9-Speed Automatic Overdrive | t86759 | S AWD | NaN | AWD | All-Wheel Drive | 107.9 in | 85.6 in | 2020 |
9 | SALZL2GX4LH007593 | 33.8 in | NaN | NaN | NaN | SUV / Crossover | NaN | San Juan | NaN | NaN | 510 | 922 | [!@@Additional Info@@!]4-Wheel Disc Brakes,A/C... | I4 | 2000.0 | I4 | Blanco | False | False | True | Land Rover | 40 in | 17.7 gal | Gasoline | False | 64.9 in | NaN | 296.0 | Eclipse/Ebony | False | NaN | NaN | False | NaN | 18.4439 | 172.1 in | 2019-04-18 | WHITE | 238225156 | -66.0785 | https://static.cargurus.com/images/forsale/202... | ['Leather Seats', 'Navigation System', 'Adapti... | Land Rover | 5 seats | 254.0 | Range Rover Evoque | NaN | 296 hp @ 5,500 RPM | 84399.0 | False | 0 | 3.0 | 389227.0 | Land Rover San Juan | False | 295 lb-ft @ 1,600 RPM | A | 9-Speed Automatic Overdrive | t85531 | P300 R-Dynamic SE AWD | NaN | AWD | All-Wheel Drive | 105.6 in | 82.7 in | 2020 |
df2 = df
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3000040 entries, 0 to 3000039 Data columns (total 66 columns): # Column Dtype --- ------ ----- 0 vin object 1 back_legroom object 2 bed object 3 bed_height object 4 bed_length object 5 body_type object 6 cabin object 7 city object 8 city_fuel_economy float64 9 combine_fuel_economy float64 10 daysonmarket int64 11 dealer_zip object 12 description object 13 engine_cylinders object 14 engine_displacement float64 15 engine_type object 16 exterior_color object 17 fleet object 18 frame_damaged object 19 franchise_dealer bool 20 franchise_make object 21 front_legroom object 22 fuel_tank_volume object 23 fuel_type object 24 has_accidents object 25 height object 26 highway_fuel_economy float64 27 horsepower float64 28 interior_color object 29 isCab object 30 is_certified float64 31 is_cpo object 32 is_new bool 33 is_oemcpo object 34 latitude float64 35 length object 36 listed_date object 37 listing_color object 38 listing_id int64 39 longitude float64 40 main_picture_url object 41 major_options object 42 make_name object 43 maximum_seating object 44 mileage float64 45 model_name object 46 owner_count float64 47 power object 48 price float64 49 salvage object 50 savings_amount int64 51 seller_rating float64 52 sp_id float64 53 sp_name object 54 theft_title object 55 torque object 56 transmission object 57 transmission_display object 58 trimId object 59 trim_name object 60 vehicle_damage_category float64 61 wheel_system object 62 wheel_system_display object 63 wheelbase object 64 width object 65 year int64 dtypes: bool(2), float64(14), int64(4), object(46) memory usage: 1.4+ GB
df=df[['make_name','model_name', 'year', 'body_type', 'listing_color', 'mileage', 'is_new', 'owner_count', 'has_accidents', 'price', 'maximum_seating', 'horsepower', 'engine_type', 'city_fuel_economy', 'highway_fuel_economy', 'transmission']]
df.head(10)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3000040 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 make_name object 1 model_name object 2 year int64 3 body_type object 4 listing_color object 5 mileage float64 6 is_new bool 7 owner_count float64 8 has_accidents object 9 price float64 10 maximum_seating object 11 horsepower float64 12 engine_type object 13 city_fuel_economy float64 14 highway_fuel_economy float64 15 transmission object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 346.2+ MB
df.replace(' ', '_', regex=True)
df.columns = df.columns.str.replace('_',' ')
df.columns = df.columns.str.upper()
df.head(10)
MAKE NAME | MODEL NAME | YEAR | BODY TYPE | LISTING COLOR | MILEAGE | IS NEW | OWNER COUNT | HAS ACCIDENTS | PRICE | MAXIMUM SEATING | HORSEPOWER | ENGINE TYPE | CITY FUEL ECONOMY | HIGHWAY FUEL ECONOMY | TRANSMISSION | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Jeep | Renegade | 2019 | SUV / Crossover | YELLOW | 7.0 | True | NaN | NaN | 23141.0 | 5 seats | 177.0 | I4 | NaN | NaN | A |
1 | Land Rover | Discovery Sport | 2020 | SUV / Crossover | BLACK | 8.0 | True | NaN | NaN | 46500.0 | 7 seats | 246.0 | I4 | NaN | NaN | A |
2 | Subaru | WRX STI | 2016 | Sedan | UNKNOWN | NaN | False | 3.0 | False | 46995.0 | 5 seats | 305.0 | H4 | 17.0 | 23.0 | M |
3 | Land Rover | Discovery | 2020 | SUV / Crossover | GRAY | 11.0 | True | NaN | NaN | 67430.0 | 7 seats | 340.0 | V6 | NaN | NaN | A |
4 | Land Rover | Discovery Sport | 2020 | SUV / Crossover | BLACK | 7.0 | True | NaN | NaN | 48880.0 | 7 seats | 246.0 | I4 | NaN | NaN | A |
5 | Land Rover | Range Rover Velar | 2020 | SUV / Crossover | UNKNOWN | 12.0 | True | NaN | False | 66903.0 | 5 seats | 247.0 | I4 | NaN | NaN | A |
6 | Mazda | MAZDA3 | 2019 | Sedan | SILVER | 14.0 | True | NaN | NaN | 23695.0 | 5 seats | 186.0 | I4 | NaN | NaN | A |
7 | Land Rover | Range Rover Velar | 2020 | SUV / Crossover | WHITE | 11.0 | True | NaN | NaN | 68520.0 | 5 seats | 247.0 | I4 | NaN | NaN | A |
8 | Land Rover | Discovery Sport | 2020 | SUV / Crossover | GRAY | 8.0 | True | NaN | NaN | 51245.0 | 7 seats | 246.0 | I4 | NaN | NaN | A |
9 | Land Rover | Range Rover Evoque | 2020 | SUV / Crossover | WHITE | 254.0 | False | NaN | False | 84399.0 | 5 seats | 296.0 | I4 | NaN | NaN | A |
Next I noticed that the BODY TYPE column had incomplete data. I cleaned the data by dropping the rows with incomplete data.
#The indexes of cars with incomplete data for body type
incompleteBodyType = df[df['BODY TYPE'].isna()].index
#Drop the rows
df = df.drop(incompleteBodyType)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2986497 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 367.4+ MB
Next I noticed that the IS NEW column has complete data, however the OWNER COUNT column which should be related to the IS NEW column had some missing values. I cleaned the data by comparing the IS NEW column and the OWNER COUNT columns. Comparison 1: The car is used but the owner count is not listed. In this case I dropped the rows because we did not want any incomplete information in the data. Comparision 2: The car is new but the owner count is not na. In this case I dropped the rows because we did not any misleading information in the data. Comparison 3: The car is new but the owner count is na. In this case I want to change the value from na to 0.0.
#The indexes of used cars with incomplete data
usedIncompleteData = df[(df['IS NEW']== False) & (df['OWNER COUNT'].isna())].index
#Drop the rows
df = df.drop(usedIncompleteData)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2940141 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 361.7+ MB
#The indexes of new cars with misleading data
newIncorrectData = df[(df['IS NEW']== True) & (df['OWNER COUNT'].notna())].index
#Drop the rows
df = df.drop(newIncorrectData)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2939178 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 361.6+ MB
#Fill the rest of the na values in owner count with 0.0
df['OWNER COUNT'].fillna(0.0,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2939178 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 361.6+ MB
Next I noticed the missing values in the MILEAGE column, however this column should also relate to the IS NEW column. We cleaned the data by comparing the IS NEW column and the MILEAGE columns. For my purposes new cars will have 0.0 mileage. A new car with less than 200 miles will have its mileage changed to 0. Anything above a 200 will be droppped because the data is incorrect. Comparison 1: The car is used but the mileage data is missing. I dropped the rows becuase we do not want incomplete data. Comparision 2: The car is new but the mileage data is greater than 200. I droppped the rows because the data is incorrect. Comparision 3: The car is new but the mileage data is less than 200. I changed the values to 0.0 because a new car in our data will have 0.0 mileage. Comparison 4: The car is new but the mileage data is na. I changed the values from na to 0.0.
#The indexes of used cars with incomplete data
usedIncompleteData = df[(df['IS NEW'] == False) & (df['MILEAGE'].isna())].index
#Drop the rows
df = df.drop(usedIncompleteData)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2929176 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 360.4+ MB
#The indexes of new cars with mileage over 200
newOver200 = df[(df['IS NEW'] == True) & (df['MILEAGE'].notna()) & (df['MILEAGE'] > 200)].index
#Drop the rows
df = df.drop(newOver200)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870785 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 353.2+ MB
#The indexes of new cars with mileage over 200
newUnder200 = df[(df['IS NEW'] == True) & (df['MILEAGE'].notna()) & (df['MILEAGE'] < 200)].index
#Change the mileage to 0.0
df.loc[newUnder200,'MILEAGE'] = 0.0
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870785 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 417.7+ MB
#The indexes of new cars with mileage na
newNAMileage = df[(df['IS NEW'] == True) & (df['MILEAGE'].isna())]
#Change the mileage to 0.0
df['MILEAGE'].fillna(0.0,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870785 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 417.7+ MB
Next I noticed the missing values in the HAS ACCIDENTS column, however this column should also relate to the IS NEW column. I cleaned the data by comparing the IS NEW column and the HAS ACCIDENTS columns. Comparison 1: The car is used but the HAS ACCIDENTS data is missing. We dropped the rows becuase I did not want incomplete data. Comparision 3: The car is new but the HAS ACCIDENTS data is True. I droppped the rows because the data is incorrect. Comparison 4: The car is new but the HAS ACCIDENTS data is na. I changed the values from na to False.
#The indexes of used cars with accidents na
usedAccidentsNA = df[(df['IS NEW'] == False) & (df['HAS ACCIDENTS'].isna())].index
#Drop the rows
df = df.drop(usedAccidentsNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870785 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 417.7+ MB
#The indexes of new cars with accidents
newAccidents = df[(df['IS NEW'] == True) & (df['HAS ACCIDENTS'] == True)].index
#Drop the rows
df = df.drop(newAccidents)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870782 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 353.2+ MB
#The indexes of new cars with accidents na
newAccidentsNA = df[(df['IS NEW'] == True) & (df['HAS ACCIDENTS'].isna())].index
#Change the accidents to False
df.loc[newAccidentsNA,'HAS ACCIDENTS'] = False
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2870782 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 417.7+ MB
Next I noticed the missing in values in MAXIMUM SEATING, ENGINE TYPE,HORSEPOWER, TRANSMISSION, FUEL ECONOMY, and SELLER RATING. All of these columns are independant of other columns. I wanted the most complete data so any rows with incomplete data I dropped.
#The indexes of the rows with maximum seating na
seatingNA = df[df['MAXIMUM SEATING'].isna()].index
#Drop the rows
df = df.drop(seatingNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2733501 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 336.3+ MB
#The indexes of the rows with engine type na
engineNA = df[df['ENGINE TYPE'].isna()].index
#Drop the rows
df = df.drop(engineNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2702202 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 332.4+ MB
#The indexes of the rows with horsepower na
horseNA = df[df['HORSEPOWER'].isna()].index
#Drop the rows
df = df.drop(horseNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2685425 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 330.4+ MB
#The indexes of the rows with transmission na
transNA = df[df['TRANSMISSION'].isna()].index
#Drop the rows
df = df.drop(transNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2635664 entries, 0 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 324.2+ MB
#The indexes of the rows with fuel eco city na
cityNA = df[df['CITY FUEL ECONOMY'].isna()].index
#Drop the rows
df = df.drop(cityNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2357387 entries, 12 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 290.0+ MB
#The indexes of the rows with fuel eco highway na
highwayNA = df[df['HIGHWAY FUEL ECONOMY'].isna()].index
#Drop the rows
df = df.drop(highwayNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2357387 entries, 12 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 354.5+ MB
Lastly I found that the LISTING COLOR column while not having any missing values did have the string 'UNKNOWN' for some rows. I want the most complete data we can get so we dropped the rows were the color was listed as unknown.
#The indexes of the rows with UNKNOWN color
colorNA = df[df['LISTING COLOR'] == "UNKNOWN"].index
#Drop the rows
df = df.drop(colorNA)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2033481 entries, 12 to 3000039 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 MAKE NAME object 1 MODEL NAME object 2 YEAR int64 3 BODY TYPE object 4 LISTING COLOR object 5 MILEAGE float64 6 IS NEW bool 7 OWNER COUNT float64 8 HAS ACCIDENTS object 9 PRICE float64 10 MAXIMUM SEATING object 11 HORSEPOWER float64 12 ENGINE TYPE object 13 CITY FUEL ECONOMY float64 14 HIGHWAY FUEL ECONOMY float64 15 TRANSMISSION object dtypes: bool(1), float64(6), int64(1), object(8) memory usage: 250.2+ MB
from sklearn.preprocessing import OrdinalEncoder
order_encoder_body=OrdinalEncoder()
df['BODY TYPE ENCODED'] = order_encoder_body.fit_transform(df['BODY TYPE'].to_numpy().reshape(-1, 1))
order_encoder_engine=OrdinalEncoder()
df['ENGINE TYPE ENCODED'] = order_encoder_engine.fit_transform(df['ENGINE TYPE'].to_numpy().reshape(-1, 1))
df.head(10)
MAKE NAME | MODEL NAME | YEAR | BODY TYPE | LISTING COLOR | MILEAGE | IS NEW | OWNER COUNT | HAS ACCIDENTS | PRICE | MAXIMUM SEATING | HORSEPOWER | ENGINE TYPE | CITY FUEL ECONOMY | HIGHWAY FUEL ECONOMY | TRANSMISSION | BODY TYPE ENCODED | ENGINE TYPE ENCODED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | BMW | 3 Series | 2016 | Sedan | SILVER | 6903.0 | False | 2.0 | False | 58995.0 | 5 seats | 320.0 | I6 | 22.0 | 33.0 | A | 6.0 | 13.0 |
38 | Chevrolet | Malibu | 2018 | Sedan | SILVER | 42394.0 | False | 1.0 | False | 14639.0 | 5 seats | 160.0 | I4 | 27.0 | 36.0 | A | 6.0 | 6.0 |
40 | Lexus | RC 350 | 2018 | Coupe | BLACK | 62251.0 | False | 1.0 | False | 32000.0 | 4 seats | 311.0 | V6 | 18.0 | 24.0 | A | 1.0 | 20.0 |
41 | Chevrolet | Traverse | 2018 | SUV / Crossover | SILVER | 36410.0 | False | 1.0 | False | 23723.0 | 8 seats | 310.0 | V6 | 18.0 | 27.0 | A | 5.0 | 20.0 |
45 | Chevrolet | Traverse | 2017 | SUV / Crossover | SILVER | 36055.0 | False | 1.0 | True | 22422.0 | 8 seats | 281.0 | V6 | 15.0 | 22.0 | A | 5.0 | 20.0 |
47 | Jeep | Grand Cherokee | 2018 | SUV / Crossover | BLACK | 25745.0 | False | 1.0 | False | 29424.0 | 5 seats | 295.0 | V6 | 18.0 | 25.0 | A | 5.0 | 20.0 |
49 | Jeep | Compass | 2017 | SUV / Crossover | BLACK | 14607.0 | False | 1.0 | False | 17000.0 | 5 seats | 172.0 | I4 | 20.0 | 25.0 | A | 5.0 | 6.0 |
51 | Hyundai | Veloster | 2020 | Coupe | RED | 3073.0 | False | 1.0 | False | 16900.0 | 4 seats | 147.0 | I4 | 27.0 | 34.0 | A | 1.0 | 6.0 |
53 | Jeep | Grand Cherokee | 2017 | SUV / Crossover | SILVER | 16467.0 | False | 1.0 | False | 25500.0 | 5 seats | 241.0 | V6 | 18.0 | 25.0 | A | 5.0 | 20.0 |
54 | Chevrolet | Traverse | 2018 | SUV / Crossover | WHITE | 37536.0 | False | 1.0 | False | 23939.0 | 8 seats | 310.0 | V6 | 18.0 | 27.0 | A | 5.0 | 20.0 |
Next I decided to turn the seats column into a flat integers and drop any nan values for better use by algorithms.
df = df.replace(regex=[' seats'], value='')
df['MAXIMUM SEATING'] = pd.to_numeric(df['MAXIMUM SEATING'],errors = 'coerce')
#The indexes of the rows with seating as na
seatingNa = df[df['MAXIMUM SEATING'].isna()].index
#Drop the rows
df = df.drop(seatingNa)
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
df_cluster = df[['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']]
pca_model = PCA()
df_cluster_pca = pca_model.fit_transform(df_cluster)
pca_df_cluster = pd.DataFrame(df_cluster_pca, columns=['PC1','PC2', 'PC3', 'PC3', 'PC4'])
df = pd.merge(df, pca_df_cluster, right_index=True, left_index=True)
df.head()
MAKE NAME | MODEL NAME | YEAR | BODY TYPE | LISTING COLOR | MILEAGE | IS NEW | OWNER COUNT | HAS ACCIDENTS | PRICE | MAXIMUM SEATING | HORSEPOWER | ENGINE TYPE | CITY FUEL ECONOMY | HIGHWAY FUEL ECONOMY | TRANSMISSION | BODY TYPE ENCODED | ENGINE TYPE ENCODED | PC1 | PC2 | PC3 | PC3 | PC4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | BMW | 3 Series | 2016 | Sedan | SILVER | 6903.0 | False | 2.0 | False | 58995.0 | 5.0 | 320.0 | I6 | 22.0 | 33.0 | A | 6.0 | 13.0 | -0.065989 | -8.400893 | -5.117548 | -0.336206 | -0.941181 |
38 | Chevrolet | Malibu | 2018 | Sedan | SILVER | 42394.0 | False | 1.0 | False | 14639.0 | 5.0 | 160.0 | I4 | 27.0 | 36.0 | A | 6.0 | 6.0 | 178.779645 | 1.507774 | -0.775535 | 0.440298 | 1.815928 |
40 | Lexus | RC 350 | 2018 | Coupe | BLACK | 62251.0 | False | 1.0 | False | 32000.0 | 4.0 | 311.0 | V6 | 18.0 | 24.0 | A | 1.0 | 20.0 | 40.029696 | -8.739841 | -1.883397 | 0.018708 | 1.997441 |
41 | Chevrolet | Traverse | 2018 | SUV / Crossover | SILVER | 36410.0 | False | 1.0 | False | 23723.0 | 8.0 | 310.0 | V6 | 18.0 | 27.0 | A | 5.0 | 20.0 | 5.118766 | 6.317278 | 4.963830 | -1.261450 | -1.000357 |
45 | Chevrolet | Traverse | 2017 | SUV / Crossover | SILVER | 36055.0 | False | 1.0 | True | 22422.0 | 8.0 | 281.0 | V6 | 15.0 | 22.0 | A | 5.0 | 20.0 | -69.907054 | -8.019258 | 3.903574 | 1.197020 | -0.350823 |
print('Explained Variance Ratio')
for i in range(5):
print('PC{}: {}'.format(i+1,pca_model.explained_variance_ratio_[i]))
Explained Variance Ratio PC1: 0.9915175225698315 PC2: 0.005320779761254336 PC3: 0.002606617911790896 PC4: 0.00042965830278262663 PC5: 0.00012542145434066125
pca = PCA()
pca.fit(df_cluster)
cumsum = np.cumsum(pca.explained_variance_ratio_)
dim = np.arange(len(cumsum))+1
plt.plot(dim,cumsum,'-',lw=3);
plt.xlabel('Dimensions')
plt.ylabel('Variance Explained')
plt.title('Selecting the right number of dimensions')
plt.xticks([1,2,3,4,5])
plt.ylim([0,1.1]);
This makes sense because fuel economy and horsepower rely on engine size which makes the defining columns not independant so they essentially give similar information.
variance_explained = .95
pca = PCA(n_components = variance_explained)
pca_data = pca.fit_transform(df_cluster)
print("{} Principal Components are required to explain {} of the variation in this data.".format(pca.n_components_, variance_explained))
1 Principal Components are required to explain 0.95 of the variation in this data.
sns.stripplot(x="PC1", y="BODY TYPE", data=df)
plt.title('Used Cars Visualized in One Dimension')
Text(0.5, 1.0, 'Used Cars Visualized in One Dimension')
df_mask = df['MODEL NAME'] == 'F-150'
df_150 = df[df_mask]
df_150_PC = df_150[['PC1']]
df_all_PC = df[['PC1']]
df_150.head()
MAKE NAME | MODEL NAME | YEAR | BODY TYPE | LISTING COLOR | MILEAGE | IS NEW | OWNER COUNT | HAS ACCIDENTS | PRICE | MAXIMUM SEATING | HORSEPOWER | ENGINE TYPE | CITY FUEL ECONOMY | HIGHWAY FUEL ECONOMY | TRANSMISSION | BODY TYPE ENCODED | ENGINE TYPE ENCODED | PC1 | PC2 | PC3 | PC3 | PC4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
381 | Ford | F-150 | 2001 | Pickup Truck | GRAY | 173122.0 | False | 2.0 | False | 3499.0 | 6.0 | 260.0 | V8 | 12.0 | 16.0 | A | 4.0 | 26.0 | -136.500942 | -1.618505 | -3.922177 | 1.227352 | -0.148023 |
477 | Ford | F-150 | 2016 | Pickup Truck | BLACK | 32168.0 | False | 2.0 | False | 22118.0 | 3.0 | 282.0 | V6 Flex Fuel Vehicle | 17.0 | 23.0 | A | 4.0 | 24.0 | -75.814952 | -9.296057 | 3.734391 | 2.014478 | 1.590737 |
496 | Ford | F-150 | 2015 | Pickup Truck | BLACK | 66211.0 | False | 2.0 | False | 35965.0 | 5.0 | 365.0 | V6 | 17.0 | 23.0 | A | 4.0 | 20.0 | 40.029696 | -8.739841 | -1.883397 | 0.018708 | 1.997441 |
551 | Ford | F-150 | 2018 | Pickup Truck | WHITE | 32299.0 | False | 1.0 | False | 22685.0 | 3.0 | 290.0 | V6 Flex Fuel Vehicle | 20.0 | 25.0 | A | 4.0 | 24.0 | 67.234743 | 2.306615 | 17.625948 | 2.654663 | -0.075816 |
1339 | Ford | F-150 | 2016 | Pickup Truck | RED | 29441.0 | False | 1.0 | False | 42742.0 | 6.0 | 365.0 | V6 | 16.0 | 22.0 | A | 4.0 | 20.0 | 203.477592 | 5.882433 | -0.056673 | -1.879045 | -1.008545 |
from sklearn.cluster import KMeans
#nine different body types
K = 9
km = KMeans(n_clusters = K)
km.fit(df_all_PC)
predictions = km.predict(df_150_PC)
print(predictions)
[2 7 3 ... 7 2 2]
feature_set = ['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']
features = df[feature_set]
target = df['BODY TYPE ENCODED']
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import fowlkes_mallows_score
num_clusters = 9
km = KMeans(n_clusters=num_clusters)
km.fit(feature_train)
predictions=km.predict(feature_test)
print("Davies-Bouldin Index:", davies_bouldin_score(feature_test, predictions))
print("Silhouette Coefficient:", silhouette_score(feature_test, predictions))
print("Adjusted Rand Score:", adjusted_rand_score(target_test, predictions))
print("Fowlkes Mallows Score:", fowlkes_mallows_score(target_test, predictions))
Davies-Bouldin Index: 0.6381341489188128 Silhouette Coefficient: 0.4984742399332719 Adjusted Rand Score: 1.2671855627134967 Fowlkes Mallows Score: 0.290885003158871
Davies-Bouldin Index is interpreted as lower scores meaning better clustering, so we achieved a relatively poor score. Our Silhouette Coefficient is not horrible because the range is [-1, 1], so our score indicates that samples were assigned to the right cluster Our adjusted rand score outright failed, most likely due to the size of the data set. Our Fowlkes Mallows score is again pretty poor because better clustering results in a higher score.
I observed that these scores where pretty bad, so I decided to see the prediction results of a classification method. In my case I chose to use a Decision Tree Classification algorithm.
features = df[['CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MAXIMUM SEATING', 'ENGINE TYPE ENCODED', 'HORSEPOWER']]
target = df['BODY TYPE ENCODED']
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(feature_train,target_train)
predictions = clf.predict(feature_test)
print(predictions)
[3. 5. 5. ... 5. 5. 5.]
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
print("Accuracy:",accuracy_score(target_test, predictions))
print("Precision:",precision_score(target_test, predictions, average = 'macro'))
print("Recall:",recall_score(target_test, predictions, average = 'macro'))
print("F1-Score:",f1_score(target_test, predictions, average = 'macro'))
Accuracy: 0.9787284341348567 Precision: 0.9499259306379962 Recall: 0.9095862490780893 F1-Score: 0.9280160189269451
All of the tests indicate better results when closer to one, so our results all indicate that the classification algorithm performed well.
For the clustering and classification predictions I decided to focus on only used cars under 50,000 dollars. I binned the price column into 5 ranges to come up with the original labels. To bin the price ranges I used the pandas function cut which cuts the data into bins based on 10000 dollar ranges. I then labeled the bins from 1 to 5 1 being the cheapest range 5 being the most expensive range. Next I label encoded both the make and model name columns. I chose k-means and Agglomerative for clustering. I chose K-neighbors and Decision Tree for classification.
#The used cars
used = df[df['IS NEW'] == False]
#The used cars under 50000
used = used[used['PRICE'] <= 50000]
#Bin labels
bin_labels = [1, 2, 3, 4, 5]
bins = [0,10000,20000,30000,40000,50000]
#Using pd cut to bin the price data
used['PRICE RANGE'] = pd.cut(used['PRICE'], bins = bins, labels=bin_labels)
# Instantiate the Label Encoder Object
label_encoder = LabelEncoder()
# Fit and transform the column
used['MAKE NAME ENCODED'] = label_encoder.fit_transform(used['MAKE NAME'])
used['MODEL NAME ENCODED'] = label_encoder.fit_transform(used['MODEL NAME'])
#Filter the desired columns
used = used[['MAKE NAME','MAKE NAME ENCODED','MODEL NAME','MODEL NAME ENCODED','YEAR','MILEAGE', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY','PRICE','PRICE RANGE']]
#Split the dataset in two parts: feature set and target label
feature_set = used[['MAKE NAME ENCODED','MODEL NAME ENCODED','YEAR','MILEAGE', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY']]
features = feature_set
target = used['PRICE RANGE']
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
#Specify the number of clusters
num_clusters = 5
#Create and fit the KMeans model
km = KMeans(n_clusters=num_clusters)
km.fit(feature_train)
#Predict the target variable
predictions=km.predict(feature_test)
#Calculate internal performance evaluation measures
print("Davies-Bouldin Index:", davies_bouldin_score(feature_test, predictions))
print("Silhouette Coefficient:", silhouette_score(feature_test, predictions))
Davies-Bouldin Index: 0.5661549672831206 Silhouette Coefficient: 0.5327719692166731
The Davies-Bouldin score is poor the optimal score is 0 with lower values indicating better clustering.
#Create a Decision Tree classifier object
clf = DecisionTreeClassifier()
#Train the model using training dataset
clf = clf.fit(feature_train,target_train)
# Predict the response for test dataset
predictions = clf.predict(feature_test)
#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))
Accuracy: 0.7776631357936805 Precision: 0.7776631357936805 Recall: 0.7776631357936805 F1-Score: 0.7776631357936805
#Create a KNN classifier object
model = KNeighborsClassifier(n_neighbors=5)
#Train the model using the training dataset
model.fit(feature_train,target_train)
#Predict the target variable for test dataset
predictions = model.predict(feature_test)
#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))
Accuracy: 0.5766331601688691 Precision: 0.5766331601688691 Recall: 0.5766331601688691 F1-Score: 0.5766331601688691
#Initialize the StandardScaler
scaler = StandardScaler()
#Scale the mileage data
scaler.fit(used['MILEAGE'].values.reshape(-1,1))
used['MILEAGE STANDARDIZED'] = scaler.transform(used['MILEAGE'].values.reshape(-1,1))
scaler.fit(used['CITY FUEL ECONOMY'].values.reshape(-1,1))
used['CITY FUEL ECONOMY STANDARDIZED'] = scaler.transform(used['CITY FUEL ECONOMY'].values.reshape(-1,1))
scaler.fit(used['HIGHWAY FUEL ECONOMY'].values.reshape(-1,1))
used['HIGHWAY FUEL ECONOMY STANDARDIZED'] = scaler.transform(used['HIGHWAY FUEL ECONOMY'].values.reshape(-1,1))
#Split the dataset in two parts: feature set and target label
feature_set = used[['MAKE NAME ENCODED','MODEL NAME ENCODED','YEAR','MILEAGE STANDARDIZED', 'CITY FUEL ECONOMY STANDARDIZED', 'HIGHWAY FUEL ECONOMY STANDARDIZED']]
features = feature_set
target = used['PRICE RANGE']
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
#Create a KNN classifier object
model = KNeighborsClassifier(n_neighbors=5)
#Train the model using the training dataset
model.fit(feature_train,target_train)
#Predict the target variable for test dataset
predictions = model.predict(feature_test)
#Calculate model accuracy
print("Accuracy:",accuracy_score(target_test, predictions))
#Calculate model precision
print("Precision:",precision_score(target_test, predictions, average='micro'))
#Calculate model recall
print("Recall:",recall_score(target_test, predictions, average='micro'))
#Calculate model f1 score
print("F1-Score:",f1_score(target_test, predictions, average='micro'))
Accuracy: 0.8263639517542476 Precision: 0.8263639517542476 Recall: 0.8263639517542476 F1-Score: 0.8263639517542476
dfMakePrice = df.groupby('MAKE NAME').mean()['PRICE']
dfMakePrice = dfMakePrice.sort_values(ascending = False)
dfMakePriceTop10 = dfMakePrice.head(10)
dfMakePriceTop10 = dfMakePriceTop10.reset_index()
dfMakePriceTop10 = dfMakePriceTop10.head(10)
import plotly.express as px
fig = px.bar(dfMakePriceTop10, x='MAKE NAME', y='PRICE',
labels={
"PRICE": "Average Price",
"MAKE NAME": "Make"
},
title="Top 10 Average Prices by Make")
fig.show()
df_damaged = df[(df['HAS ACCIDENTS'] == True)]
df_not = df[(df['HAS ACCIDENTS'] == False)]
plt.figure(figsize=(15, 10))
plt.scatter(x = df_not['PRICE'] , y = df_not['MILEAGE'], s=1, color='blue')
plt.scatter(x = df_damaged['PRICE'] , y = df_damaged['MILEAGE'], s=2, color='red')
plt.xlabel("PRICE")
plt.ylabel("MILEAGE")
plt.xlim((0, 500000))
plt.legend(['Not Damaged', 'Damaged'], loc='upper right')
plt.show()
The scatter plot and bar plot shows the presence of outliers and a skew in price which needs to be addresses for the other graphs to be meaningful. Also, we can see how the majority of accident cars are cheaper and spread across all mileage
df['LISTING COLOR'].value_counts()
BLACK 326776 WHITE 318261 SILVER 209393 GRAY 201712 BLUE 141911 RED 140168 GREEN 13208 BROWN 12099 ORANGE 6407 GOLD 5568 TEAL 3175 YELLOW 2260 PURPLE 836 PINK 39 Name: LISTING COLOR, dtype: int64
As you can see there are some heavy outliers that would make it difficult to plot anything with price. So, I am going to remove them before plotting anything else using percentiles because anything dealing with means would be influenced by the outliers.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(x = 'LISTING COLOR', y = 'PRICE', data=df)
ax.set_title('PRICE by COLOR')
Text(0.5, 1.0, 'PRICE by COLOR')
# Drop the outlier observations using Percentiles
upper_limit = df['PRICE'].quantile(.999)
dfDrop = df[(df['PRICE'] < upper_limit)]
plt.figure(figsize=(15, 5))
ax = sns.boxplot(x = 'LISTING COLOR', y = 'PRICE', data=dfDrop)
ax.set_title('PRICE by COLOR')
Text(0.5, 1.0, 'PRICE by COLOR')
As we can see, some colors like pink and purple have a similar quartile price points, but not many exist on the higher end of price. This makes sense because they are less common car colors while the more common colors have a wider distribution.
make_count = df['MAKE NAME'].value_counts()
unique_makes = df['MAKE NAME'].unique()
make_count = make_count.to_frame()
make_count = make_count.reset_index()
make_count = make_count.rename({'index': 'MAKE', 'MAKE NAME': 'COUNT'}, axis='columns')
make_count_top24 = make_count.head(24)
make_count_rest = make_count.iloc[24:]
other_total = make_count_rest['COUNT'].sum()
other = {'MAKE': 'Other', 'COUNT': other_total}
make_count_top25 = make_count_top24.append(other, ignore_index = True)
plt.figure(figsize=(15, 15))
plt.pie(make_count_top25['COUNT'], labels = make_count_top25['MAKE'],
shadow= True,
autopct='%.1f%%')
plt.title('Percentage of Different Car Makes')
plt.axis('equal')
plt.show()
plt.figure(figsize=(20, 10))
ax = sns.violinplot(x = 'HAS ACCIDENTS', y = 'PRICE', data=dfDrop);
ax.set_title('Comparing price vs accident status')
ax.set_ylim((-10000, 150000))
(-10000.0, 150000.0)
This violinplot further solidifies the notion that a cars price noticably drops when it incurs an accident
plt.figure(figsize=(20, 10))
ax = sns.violinplot(x = 'HAS ACCIDENTS', y = 'MILEAGE', data=dfDrop);
ax.set_title('Comparing mileage vs accident status')
Text(0.5, 1.0, 'Comparing mileage vs accident status')
As expected the non accident section contains a lot of cars on the low end because they are new and couldn't have been in an accident.
dfTop500Mile = df.sort_values(by=["MILEAGE"], ascending = False).head(500)
count = dfTop500Mile['MAKE NAME'].value_counts().head(10)
unq = count.index.unique()
plt.figure(figsize=(12, 6))
ax = sns.swarmplot(x = 'MAKE NAME', y = 'MILEAGE', data=dfTop500Mile, hue='HAS ACCIDENTS', order = unq)
plt.legend(loc='upper right', title='HAS ACCIDENTS')
plt.title('Top 10 Makes of the top 500 cars with the most mileage')
Text(0.5, 1.0, 'Top 10 Makes of the top 500 cars with the most mileage')
Since a car with more miles has more opporutunity to be in an accident, we chose the top mileage of the top makes to see if there was a relation between make and accident occurance. Surprisingly, there doesn't seem to be any coorelation between make and accident status
with sns.axes_style('white'):
plot = sns.catplot(x = 'YEAR', data = df, aspect=1.5, kind = 'count',
hue = 'TRANSMISSION', height = 10)
plot.set_ylabels('Number of cars')
plt.show()
Interestingly, we would assume that manual cars are more liekly to decrease in count but they are actually increasing very gradually up until 2021.
# Obviously skewed data
with sns.axes_style('white'):
plot = sns.jointplot(x = 'PRICE', y = 'MILEAGE', data = dfDrop, kind = 'hex', height = 10, xlim = (-2500, 200000), ylim = (-5000, 220000))
The price and mileage categories are heavily skewed to the right, so, by square rooting the categories we can get a better sense of how mileage and price are reliant on each other.
dfDrop['MILEAGE FEATURE TRANSFORMATION'] = np.cbrt(dfDrop['MILEAGE'])
dfDrop['PRICE FEATURE TRANSFORMATION'] = np.sqrt(dfDrop['PRICE'])
with sns.axes_style('white'):
plot = sns.jointplot(x = 'PRICE FEATURE TRANSFORMATION', y = 'MILEAGE FEATURE TRANSFORMATION', data = dfDrop, kind = 'hex', height = 10, xlim = (0, 300))
plot.set_axis_labels(ylabel = 'Transformed Mileage', xlabel = 'Transformed Price')
<seaborn.axisgrid.JointGrid at 0x272d1b2e580>
So, using this graph we can show how price significantly decreases as mileage increases. But, at a certain point there is a cooresponding priced vehicle that has 0 miles on it.
ppdf = df[['HORSEPOWER', 'CITY FUEL ECONOMY', 'HIGHWAY FUEL ECONOMY', 'MILEAGE', 'HAS ACCIDENTS']]
sns.pairplot(ppdf, hue='HAS ACCIDENTS', height=2.5);
This plot is useful because we can understand how accidents play into the capabilities of a car. For example, we can see how fuel economy is worse with mileage (aka lifespan) when a car has had an accident.
#Set fig Size
plt.figure(figsize=(20, 8))
#Slant Labels
plt.xticks(rotation=45)
#Plot
sns.scatterplot(data=df, x="MAKE NAME", y="ENGINE TYPE",hue='TRANSMISSION')
<AxesSubplot:xlabel='MAKE NAME', ylabel='ENGINE TYPE'>
This plot is helpful in identifying maker trends regarding engine types and transmission types. Such as: Chevy creates the most different engine types; High end companies have less diversity in both engine type and transmission type; The most popular transmission type is A; The least popular is Dual Clutch; The most popular engine is I4.
#The used cars
pair = df[df['IS NEW'] == False]
#Used cars under 50,000
pair = pair[pair['PRICE'] <= 50000]
#Using pd qcut to bin the price data this cuts the data into equal bins by using iq ranges
pair['PRICE RANGE'] = pd.qcut(pair['PRICE'],q=5)
#Filter the desired columns
pair = pair[['YEAR','MILEAGE','PRICE RANGE']]
sns.pairplot(pair,hue='PRICE RANGE', height=5.5)
<seaborn.axisgrid.PairGrid at 0x273a83ca640>
Two interesting trends can be found by looking at these plots both of which make sense when you think about them. One the cheaper cars have higher mileages as seen in the lower right plot. Two the older cars are cheaper as seen in the upper left hand plot.