A. “Real Property Taxes” Dataset

Download the “real property taxes” dataset from the website (via OpenBaltimore), the data is located here (note you don't need to unzip it to read it into R): http://sisbid.github.io/Module1/data/Real_Property_Taxes.csv.gz

  1. Read the Property Tax data into R and call it the variable tax
library(readr)
library(stringr)
suppressPackageStartupMessages(library(dplyr))
tax = read_csv("http://sisbid.github.io/Module1/data/Real_Property_Taxes.csv.gz")
## Parsed with column specification:
## cols(
##   PropertyID = col_character(),
##   Block = col_character(),
##   Lot = col_character(),
##   Ward = col_character(),
##   Sect = col_character(),
##   PropertyAddress = col_character(),
##   LotSize = col_character(),
##   CityTax = col_character(),
##   StateTax = col_character(),
##   ResCode = col_character(),
##   AmountDue = col_character(),
##   AsOfDate = col_character(),
##   Neighborhood = col_character(),
##   PoliceDistrict = col_character(),
##   CouncilDistrict = col_double(),
##   Location = col_character()
## )
tax = as.data.frame(tax)
  1. How many addresses pay property taxes?
nrow(tax)
## [1] 238198
dim(tax)
## [1] 238198     16
  1. What is the total city and state tax paid?
options(digits=12) # so no rounding
head(tax$CityTax)
## [1] NA          NA          "$337.20"   "$1,463.45" "$1,960.26" "$2,242.00"
tax$CityTax = parse_number(tax$CityTax)
tax$StateTax = parse_number(tax$StateTax)

sum(tax$CityTax, na.rm = TRUE)
## [1] 860112763.47
sum(tax$CityTax, na.rm = TRUE)/1e6
## [1] 860.11276347
sum(tax$StateTax, na.rm = TRUE)
## [1] 43506844.27
sum(tax$StateTax, na.rm = TRUE)/1e6
## [1] 43.50684427

Combined would be 904 milion dollars paid.

  1. Subset the data to only retain those houses that are principal residences.
taxRes = tax %>% filter(ResCode == "PRINCIPAL RESIDENCE")
  1. How many such houses are there?
taxRes %>% nrow()
## [1] 113252
  1. Describe the distribution of property taxes on these residences. Use hist with certain breaks or quantile()
quantile(taxRes$StateTax,na.rm=TRUE)
##      0%     25%     50%     75%    100% 
##    0.11  100.95  139.03  200.14 3627.46
quantile(taxRes$CityTax,na.rm=TRUE)
##       0%      25%      50%      75%     100% 
##     2.25  2026.19  2790.51  4017.18 72808.22
  1. Convert the ‘LotSize’ variable to a numeric square feet variable. Tips:
tax$LotSize = str_trim(tax$LotSize) # trim to be safe
lot = tax$LotSize # for checking later

First lets take care of acres

aIndex= which(str_detect(tax$LotSize, "AC.*") | 
            str_detect(tax$LotSize, fixed(" %")))
head(aIndex)
## [1]  1  2 10 11 12 16
head(lot[aIndex])
## [1] "0.020 ACRES" "0.020 ACRES" "0.020 ACRES" "0.020 ACRES" "0.020 ACRES"
## [6] "0.020 ACRES"
acre = tax$LotSize[aIndex] # temporary variable
## find and replace character strings
acre = str_replace_all(acre, " AC.*","")
acre = str_replace_all(acre, " %","")
table(!is.na(as.numeric(acre)))
## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion
## 
## FALSE  TRUE 
##   236 18533
head(acre[is.na(as.numeric(acre))],50)
## Warning in head(acre[is.na(as.numeric(acre))], 50): NAs introduced by
## coercion
##  [1] "2-158"          "O.084"          "1-303"          "0.104ACRES"    
##  [5] "1-87"           "1-85"           "1-8"            "1-778"         
##  [9] "2-423"          "1-791"          "1-81"           "1-0171"        
## [13] "O.018"          "IMP ONLY 0.190" "26,140"         "5-25"          
## [17] "8.OOO"          "1-1383"         "1-692"          "IMP.ONLY 3.615"
## [21] "2-688"          "1-566"          "0.073ACRES"     "1.914ACRES"    
## [25] "2-364"          "3-67"           "3-46"           "3-4"           
## [29] "2-617"          "2-617"          "2-617"          "4-40"          
## [33] "3-94"           "2-456"          "2-486"          "2-24"          
## [37] "1-36"           "21-8X5.5"       "3-14"           "O-70"          
## [41] "16-509"         "2-36"           "2-657"          "1-005"         
## [45] "0.029ACRES"     "0.053ACRES"     "0.041ACRES"     "0.047ACRES"    
## [49] "0.048ACRES"     "0.039ACRES"
## lets clean the rest
acre = str_replace_all(acre, "-",".") # hyphen instead of decimal
head(acre[is.na(as.numeric(acre))])
## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by coercion
## [1] "O.084"          "0.104ACRES"     "O.018"          "IMP ONLY 0.190"
## [5] "26,140"         "8.OOO"
table(!is.na(as.numeric(acre)))
## Warning in table(!is.na(as.numeric(acre))): NAs introduced by coercion
## 
## FALSE  TRUE 
##    96 18673
acre = str_replace_all(acre, "ACRES","")
head(acre[is.na(as.numeric(acre))])
## Warning in head(acre[is.na(as.numeric(acre))]): NAs introduced by coercion
## [1] "O.084"          "O.018"          "IMP ONLY 0.190" "26,140"        
## [5] "8.OOO"          "IMP.ONLY 3.615"
# take care of individual mistakes
acre = str_replace_all(acre, "O","0") # 0 vs O
acre = str_replace_all(acre, "Q","") # Q, oops
acre = str_replace_all(acre, ",.",".") # extra ,
acre = str_replace_all(acre, ",","") # extra ,
acre = str_replace_all(acre, "L","0") # leading L
acre[is.na(as.numeric(acre))]
## Warning: NAs introduced by coercion
## [1] "IMP 0N0Y 0.190" "IMP.0N0Y 3.615" "21.8X5.5"
acre2 = as.numeric(acre)*43560 
## Warning: NAs introduced by coercion
sum(is.na(acre2)) # all but 3
## [1] 3

Now let’s convert all of the square feet variables

library(purrr)
fIndex = which(str_detect(tax$LotSize, "X"))

ft = tax$LotSize[fIndex]

ft = str_replace_all(ft, fixed("&"), "-")
ft = str_replace_all(ft, "IMP ONLY ", "")
ft = str_replace_all(ft, "`","1")

ft= map_chr(str_split(ft, " "), first)

## now get the widths and lengths
width = map_chr(str_split(ft,"X"), first)
length = map_chr(str_split(ft,"X"), nth, 2) 

## width
widthFeet = as.numeric(map_chr(str_split(width, "-"), first))
## Warning: NAs introduced by coercion
widthInch = as.numeric(map_chr(str_split(width, "-"),nth,2))/12
widthInch[is.na(widthInch)] = 0 # when no inches present
totalWidth = widthFeet + widthInch # add together

# length
lengthFeet = as.numeric(map_chr(str_split(length, "-"),first))
lengthInch = as.numeric(map_chr(str_split(length, "-",2),nth,2))/12
## Warning: NAs introduced by coercion
lengthInch[is.na(lengthInch)] = 0 # when no inches present
totalLength = lengthFeet + lengthInch

# combine together for square feet
sqrtFt = totalWidth*totalLength 
ft[is.na(sqrtFt)] # what is left?
##  [1] "Q8X120"            "161X"              "11XX0"            
##  [4] "X134"              "M2X169-9"          "{1-4X128-6"       
##  [7] "13XX5"             "37-1X-60-10X57-11" "F7-10X80"         
## [10] "POINT"             "22-11"             "ASSESS"           
## [13] "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1" "O-7X125"          
## [16] "O-3X125"           "12-3XX4-2"         "N1-8X92-6"        
## [19] "POINTX100-5X10"    "]7X100"            "16-11XX5"         
## [22] "Q5X119"

And now we combine everything together:

tax$sqft = rep(NA)
tax$sqft[aIndex] = acre2
tax$sqft[fIndex] = sqrtFt
mean(!is.na(tax$sqft))
## [1] 0.934365527838
# already in square feet, easy!!
sIndex=which(str_detect(tax$LotSize, "FT") | str_detect(tax$LotSize, "S.*F."))
sf = tax$LotSize[sIndex] # subset temporary variable

sqft2 = map_chr(str_split(sf,"( |SQ|SF)"),first)
sqft2 = as.numeric(str_replace_all(sqft2, ",", "")) # remove , and convert
## Warning: NAs introduced by coercion
tax$sqft[sIndex] = sqft2
table(is.na(tax$sqft)) 
## 
##  FALSE   TRUE 
## 238006    192
## progress!

#what remains?
lot[is.na(tax$sqft)]
##   [1] "IMPROVEMENTS ONLY" "0.040"             "AIR RIGHTS"       
##   [4] "0.067"             "0.858"             "Q8X120"           
##   [7] "161X 137"          "0.030"             "IMP ONLY"         
##  [10] "0.036 ARES"        "11XX0"             "IMPROVEMENT ONLY" 
##  [13] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY" 
##  [16] "IMPROVEMENT ONLY"  "0.033"             "IMPROVEMENT ONLY" 
##  [19] "IMPROVEMENT ONLY"  "IMP ONLY"          "IMP ONLY 0.190 AC"
##  [22] "IMP ONLY"          "AIR RIGHTS"        "AIR RIGHTS"       
##  [25] "AIR RIGHTS"        "IMP ONLY"          "IMP ONLY"         
##  [28] "AIR RIGHTS"        "IMP ONLY"          "0.013"            
##  [31] NA                  "IMP ONLY"          "IMP ONLY"         
##  [34] "IMP ONLY"          "IMP ONLY"          NA                 
##  [37] NA                  NA                  "196552-8"         
##  [40] "IMP.ONLY 3.615 AC" "X134"              "M2X169-9"         
##  [43] "{1-4X128-6"        "IMP ONLY"          "IMP ONLY"         
##  [46] "IMPROVEMENTS ONLY" "IMPROVEMENTS ONLY" "IMPROVEMENTS ONLY"
##  [49] "0.016"             NA                  "0.230"            
##  [52] "0.026"             "\\560 S.F."        "1233.04"          
##  [55] "CO-57 (IMP ONLY)"  "IMP ONLY"          "0.649"            
##  [58] "ASSESSMENT ONLY"   "11979"             "IMPROVEMENT ONLY" 
##  [61] NA                  "IMP ONLY"          "2381437.2 CUBIC F"
##  [64] "03281969"          "IMPROVEMENTS ONLY" "IMP ONLY"         
##  [67] "0.028"             "IMPROVEMENT ONLY"  "IMP ONLY"         
##  [70] "IMP ONLY"          "IMPROVEMENT ONLY"  "IMP ONLY"         
##  [73] "IMP ONLY"          "AIR RIGHTS ONLY"   "IMPROVEMENT ONLY" 
##  [76] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY" 
##  [79] "0.681 ARCES"       "IMP ONLY"          "IMP ONLY"         
##  [82] "IMP ONLY"          "IMPROVEMENT ONLY"  "IMP ONLY"         
##  [85] "IMP ONLY"          "AIR RIGHTS"        "0.5404%"          
##  [88] "IMPROVEMENT ONLY"  "IMP ONLY"          "13XX5"            
##  [91] "19520819"          "37-1X-60-10X57-11" "0.024"            
##  [94] "F7-10X80"          "1159-0 S.F."       "IMPROVEMENT ONLY" 
##  [97] NA                  "0.039"             "20080211"         
## [100] "IMPROVEMENT ONLY"  "IMP ONLY"          "IMP ONLY"         
## [103] "IMPROVEMENT ONLY"  "24179D096"         "IMP ONLY"         
## [106] "IMP ONLY"          "7,138 SF"          "13,846 SF"        
## [109] "9,745 SF"          "13,772 SF"         "17,294 SF"        
## [112] "15,745 SF"         "18, 162 SF"        "POINT X 57-9"     
## [115] "01185"             "AIR RIGHTS"        "IMPROVEMENT ONLY" 
## [118] "IMPROVEMENTS ONLY" "22-11 X57"         "0.129%"           
## [121] "0.129%"            "0.129%"            "0.129%"           
## [124] "0.129%"            "ASSESS CTY 84 S.F" "68I.0 SQ FT"      
## [127] "IMPROVEMENT ONLY"  "ASSESS CTY 50 S.F" "ASSESS CTY 57X1-1"
## [130] "IMP ONLY"          "ASSESSCTY56-2X1-1" "ASSESSCTY53-2X1-1"
## [133] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "O-7X125"          
## [136] "O-3X125"           "0.026"             "IMPROVEMENT ONLY" 
## [139] "IMPROVEMENT ONLY"  "12-3XX4-2"         "ASSESS CTY"       
## [142] "IMP ONLY"          "IMP ONLY"          "IMP ONLY"         
## [145] "IMP ONLY"          "IMPROVEMENTS ONLY" "IMPROVEMENT ONLY" 
## [148] "0.032"             "20050712"          "IMP ONLY"         
## [151] "IMP ONLY"          "0.033"             "IMP.ONLY"         
## [154] "IMP ONLY"          "REAR PART 697 S.F" "IMP ONLY"         
## [157] "IMPROVEMENT ONLY"  "N1-8X92-6"         "POINTX100-5X10"   
## [160] "IMPROVEMENTS ONLY" "]7X100"            "07031966"         
## [163] "13O4 SQ FT"        "05081978"          "IMPROVEMENTS ONLY"
## [166] "AIR RIGHTS"        "AIR RIGHTS"        "IMPROVEMENT ONLY" 
## [169] "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY"  "IMPROVEMENT ONLY" 
## [172] "0.191 ARES"        "19815-4"           "11`79"            
## [175] NA                  NA                  "IMPROVEMENT ONLY" 
## [178] "IMP ONLY"          "194415"            "IMP ONLY"         
## [181] "IMPROVEMENT ONLY"  "IMP ONLY"          "120-103"          
## [184] "IMP ONLY"          "IMP. ONLY"         "196700"           
## [187] "IMPROVEMENT ONLY"  "16-11XX5"          "Q5X119"           
## [190] "21210"             "IMPROVEMENT ONLY"  "5306120"