Ames Housing Data: The Ames Housing dataset was compiled by Dean De Cock for use in data science education and expanded version of the often-cited Boston Housing dataset. The dataset provided has 80 features and 1459 instances.
Dataset description is as below:
For demo few columns are displayed but there are a lot more columns are there in the dataset.
Examples:
Input Attached Dataset: Ames_housing_dataset
Code :
# SparkSession is now the entry point of Spark # SparkSession can also be construed as gateway to spark libraries import pyspark from pyspark.sql import SparkSession # create instance of spark class spark = SparkSession.builder.appName( 'ames_housing_price_model' ).getOrCreate() df_train = spark.read.csv(r 'D:\python coding\pyspark_tutorial\Linear regression' '\housing price multiple features' '\house-prices-advanced-regression-techniques' '\train.csv' , inferSchema = True , header = True ) |
Code :
# identifying the columns having less meaningful data on the basis of datatypes l_int = [] for item in df_train.dtypes: if item[ 1 ] = = 'int' : l_int.append(item[ 0 ]) print (l_int) l_str = [] for item in df_train.dtypes: if item[ 1 ] = = 'string' : l_str.append(item[ 0 ]) print (l_str) |
Output
Integer Datatypes: ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'] String Datatypes: ['MSZoning', 'LotFrontage', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Code :
# identifying integer column records having less meaningful data # identifying integer column records having less meaningful data for i in df_train.columns: if i in l_int: a = 'df_train' + '.' + i ct_total = df_train.select(i).count() ct_zeros = df_train. filter ((col(i) = = 0 )).count() per_zeros = (ct_zeros / ct_total) * 100 print ( 'total count / zeros count ' + i + ' ' + str (ct_total) + ' / ' + str (ct_zeros) + ' / ' + str (per_zeros)) |
Output of zeros percentage:
total count/zeros count/zeros_percent OpenPorchSF 1460 / 656 / 44.93150684931507 total count/zeros count/zeros_percent EnclosedPorch 1460 / 1252 / 85.75342465753425 total count/zeros count/zeros_percent 3SsnPorch 1460 / 1436 / 98.35616438356163 total count/zeros count/zeros_percent ScreenPorch 1460 / 1344 / 92.05479452054794 total count/zeros count/zeros_percent PoolArea 1460 / 1453 / 99.52054794520548 total count/zeros count/zeros_percent PoolQC 1460 / 1453 / 99.52054794520548 total count/zeros count/zeros_percent Fence 1460 / 1453 / 99.52054794520548 total count/zeros count/zeros_percent MiscFeature 1460 / 1453 / 99.52054794520548 total count/zeros count/zeros_percent MiscVal 1460 / 1408 / 96.43835616438356 total count/zeros count/zeros_percent MoSold 1460 / 0 / 0.0 total count/zeros count/zeros_percent YrSold 1460 / 0 / 0.0
Code :
# above calculation gives us an insight about the useful features # now drop the columns having zeros or NA % more than 75 % df_new = df_train.drop( * [ 'BsmtFinSF2' , 'LowQualFinSF' , 'BsmtHalfBath' , 'EnclosedPorch' , '3SsnPorch' , 'ScreenPorch' , 'PoolArea' , 'PoolQC' , 'Fence' , 'MiscFeature' , 'MiscVal' , 'Alley' ]) df_new = df_new.drop( * [ 'Id' ]) # now we have the clean data to work |
Code :
# converting string to numeric feature from pyspark.ml.feature import StringIndexer from pyspark.ml import Pipeline feat_list = [ 'MSZoning' , 'LotFrontage' , 'Street' , 'LotShape' , 'LandContour' , 'Utilities' , 'LotConfig' , 'LandSlope' , 'Neighborhood' , 'Condition1' , 'Condition2' , 'BldgType' , 'HouseStyle' , 'RoofStyle' , 'RoofMatl' , 'Exterior1st' , 'Exterior2nd' , 'MasVnrType' , 'MasVnrArea' , 'ExterQual' , 'ExterCond' , 'Foundation' , 'BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinType2' , 'Heating' , 'HeatingQC' , 'CentralAir' , 'Electrical' , 'KitchenQual' , 'Functional' , 'FireplaceQu' , 'GarageType' , 'GarageYrBlt' , 'GarageFinish' , 'GarageQual' , 'GarageCond' , 'PavedDrive' , 'SaleType' , 'SaleCondition' ] print ( 'indexed list created' ) # there are multiple features to work # using pipeline we can convert multiple features to indexers indexers = [StringIndexer(inputCol = column, outputCol = column + "_index" ).fit(df_new) for column in feat_list] type (indexers) # Combines a given list of columns into a single vector column. # input_cols: Columns to be assembled. # returns Dataframe with assembled column. pipeline = Pipeline(stages = indexers) df_feat = pipeline.fit(df_new).transform(df_new) df_feat.columns # using above code we have converted list of features into indexes from pyspark.ml.linalg import Vectors from pyspark.ml.feature import VectorAssembler # we will convert below columns into features to work with assembler = VectorAssembler(inputCols = [ 'MSSubClass' , 'LotArea' , 'OverallQual' , 'OverallCond' , 'YearBuilt' , 'YearRemodAdd' , 'BsmtFinSF1' , 'BsmtUnfSF' , 'TotalBsmtSF' , '1stFlrSF' , '2ndFlrSF' , 'GrLivArea' , 'BsmtFullBath' , 'FullBath' , 'HalfBath' , 'GarageArea' , 'MoSold' , 'YrSold' , 'MSZoning_index' , 'LotFrontage_index' , 'Street_index' , 'LotShape_index' , 'LandContour_index' , 'Utilities_index' , 'LotConfig_index' , 'LandSlope_index' , 'Neighborhood_index' , 'Condition1_index' , 'Condition2_index' , 'BldgType_index' , 'HouseStyle_index' , 'RoofStyle_index' , 'RoofMatl_index' , 'Exterior1st_index' , 'Exterior2nd_index' , 'MasVnrType_index' , 'MasVnrArea_index' , 'ExterQual_index' , 'ExterCond_index' , 'Foundation_index' , 'BsmtQual_index' , 'BsmtCond_index' , 'BsmtExposure_index' , 'BsmtFinType1_index' , 'BsmtFinType2_index' , 'Heating_index' , 'HeatingQC_index' , 'CentralAir_index' , 'Electrical_index' , 'KitchenQual_index' , 'Functional_index' , 'FireplaceQu_index' , 'GarageType_index' , 'GarageYrBlt_index' , 'GarageFinish_index' , 'GarageQual_index' , 'GarageCond_index' , 'PavedDrive_index' , 'SaleType_index' , 'SaleCondition_index' ], outputCol = 'features' ) output = assembler.transform(df_feat) final_data = output.select( 'features' , 'SalePrice' ) # splitting data for test and validation train_data, test_data = final_data.randomSplit([ 0.7 , 0.3 ]) |
Code :
train_data.describe().show() |
test_data.describe().show() |
Code :
from pyspark.ml.regression import LinearRegression house_lr = LinearRegression(featuresCol = 'features' , labelCol = 'SalePrice' ) trained_house_model = house_lr.fit(train_data) house_results = trained_house_model.evaluate(train_data) print ( 'Rsquared Error :' , house_results.r2) # Rsquared Error : 0.8279155904297449 # model accuracy is 82 % with train data # evaluate model on test_data test_results = trained_house_model.evaluate(test_data) print ( 'Rsquared error :' , test_results.r2) # Rsquared error : 0.8431420382408793 # result is quiet better with 84 % accuracy # create unlabelled data from test_data # test_data.show() unlabeled_data = test_data.select( 'features' ) unlabeled_data.show() |
Code :
predictions = trained_house_model.transform(unlabeled_data) predictions.show() |