The packages we’ll need:
https://d-rug.github.io/using-geoparquet-R
01:00
.parquet
file? (and why bother!?)The parquet
format is a file type that contains a table inside similar to a .csv
However these files are stored in binary form not as plain text
parquet
files are column-oriented (unlike csv) and each column is stored independently
parquet
embeds the schema or data types/structure within the data itself
{sf}
and arrow
parquet
files aren’t just about compression (though some savings here)
The real benefit is speed reading/operating/writing data!
nhdplusTools
} to get HUC12s for CA 1parquet
file with {sf
} class# create dir to save data fs::dir_create("data") # get CA and download HUC12s ca <- tigris::states(progress_bar=FALSE) %>% filter(NAME=="California") # can specify any given option for huc8, huc10,etc huc12 <- nhdplusTools::get_huc(ca, type = "huc12") # this takes a minute or two huc12 <- sf::st_cast(huc12, "MULTIPOLYGON") # fix geometry # save out geoarrow::write_geoparquet(huc12, here::here("data/nhd_huc12.parquet"))
# create dir to save data fs::dir_create("data") # get CA and download HUC12s ca <- tigris::states(progress_bar=FALSE) %>% filter(NAME=="California") # can specify any given option for huc8, huc10,etc huc12 <- nhdplusTools::get_huc(ca, type = "huc12") # this takes a minute or two huc12 <- sf::st_cast(huc12, "MULTIPOLYGON") # fix geometry # save out geoarrow::write_geoparquet(huc12, here::here("data/nhd_huc12.parquet"))
# create dir to save data fs::dir_create("data") # get CA and download HUC12s ca <- tigris::states(progress_bar=FALSE) %>% filter(NAME=="California") # can specify any given option for huc8, huc10,etc huc12 <- nhdplusTools::get_huc(ca, type = "huc12") # this takes a minute or two huc12 <- sf::st_cast(huc12, "MULTIPOLYGON") # fix geometry # save out geoarrow::write_geoparquet(huc12, here::here("data/nhd_huc12.parquet"))
# create dir to save data fs::dir_create("data") # get CA and download HUC12s ca <- tigris::states(progress_bar=FALSE) %>% filter(NAME=="California") # can specify any given option for huc8, huc10,etc huc12 <- nhdplusTools::get_huc(ca, type = "huc12") # this takes a minute or two huc12 <- sf::st_cast(huc12, "MULTIPOLYGON") # fix geometry # save out geoarrow::write_geoparquet(huc12, here::here("data/nhd_huc12.parquet"))
# create dir to save data fs::dir_create("data") # get CA and download HUC12s ca <- tigris::states(progress_bar=FALSE) %>% filter(NAME=="California") # can specify any given option for huc8, huc10,etc huc12 <- nhdplusTools::get_huc(ca, type = "huc12") # this takes a minute or two huc12 <- sf::st_cast(huc12, "MULTIPOLYGON") # fix geometry # save out geoarrow::write_geoparquet(huc12, here::here("data/nhd_huc12.parquet"))
geoson.zip
is 466MB, unzipped 3.6GB)library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tictoc) tic() df <- geojsonsf::geojson_sf("~/Downloads/California.geojson") toc() # 257.514 sec elapsed # write it out tic() geoarrow::write_geoparquet(df, here::here("data/ca_bing_buildings.parquet")) toc() # writes in 11.745 seconds! # read it back in tic() df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) toc() # read in 14.724 seconds!
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
library(tigris) # get building data df_parq <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings.parquet")) cnty <- counties(state="CA") sel_cnty <- cnty %>% filter(NAME=="Butte") %>% st_transform(4326) st_crs(df_parq)$epsg st_crs(sel_cnty)$epsg # crop sf_use_s2(FALSE) sel_df <- df_parq[sel_cnty,]
sel_df <- geoarrow::read_geoparquet_sf(here::here("data/ca_bing_buildings_butte.parquet"))
# turn off s2
sf_use_s2(FALSE)
# add area
sel_df <- sel_df %>%
mutate(build_area = st_area(geometry))
# sort by largest and select top 5% largest
sel_df_lrg <- sel_df %>%
arrange(build_area) %>%
slice_max(order_by = build_area, prop = 0.05)