Ingest CCIEA Metadata Database

Read Metadata

Source: ERDDAP_CCIEA_database_2021.20210317 - Google Sheets

# GoogleSheet and GoogleDrive locations
#   shared to shares@iea-uploader.iam.gserviceaccount.com
gs_url <- "https://docs.google.com/spreadsheets/d/1F8H2UFcajLVqq_MIPS0YAUt3ZnSSJP7cZ1hxCsMLW4g/edit"
gd_url <- "https://drive.google.com/drive/u/2/folders/1seUbRmpwqhOyTjuWIndBql-m6Z0wvjxx"
  
# password authorization file for shares@iea-uploader.iam.gserviceaccount.com
gs_json <- "/Volumes/GoogleDrive/My Drive/projects/iea-auto/data/iea-uploader-27c589771060.json"
stopifnot(file.exists(gs_json))

gs4_auth(path = gs_json)
drive_auth(path = gs_json)

doc <- read_sheet(gs_url, "Documentation", skip = 2)
meta <- read_sheet(gs_url, "Indicator Metadata")

sheet_names(gs_url)

## [1] "Views"              "Documentation"      "Indicator Metadata"
## [4] "files_PI"           "files_unmatched"

# "Views"              "Documentation"      "Indicator Metadata"

User Files

To test uploading experience in Meta-app.

Data Files

Ben’s ca / data:

data_files <- drive_ls(gd_url, recursive = FALSE) %>% 
  mutate(
    url  = glue("https://drive.google.com/file/d/{id}/edit"),
    gd_file = glue("<a href='{url}' target='_blank'>{name}</id>"))

data_files %>% 
  select(gd_file) %>% 
  datatable(
    escape=F,
    extensions = 'Buttons',         
    options = list(
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
      pageLength = nrow(data_files),
      lengthMenu = c(5, 20, 50, nrow(data_files))))

files_PI

Summary of indicator name in PI file in Indicator Metadata tab of ERDDAP_CCIEA_database_2021.20210317 - Google Sheet and if match in data/ files above in gd_data.

meta_files <- meta %>% 
  filter(!is.na(`PI filename`)) %>% 
  group_by(`PI filename`, `ERDDAP Dataset ID`) %>% 
  summarize(
    vars_erddap = paste(`Variable Name/ERDDAP`, collapse = ", "),
    .groups = "drop") %>% 
  mutate(
    name = basename(`PI filename`)) %>% 
  left_join(
    data_files %>% 
      select(name, url, gd_file), by = "name")

meta_files  %>% 
  select(name, `PI filename`, `ERDDAP Dataset ID`, gd_file) %>% 
  arrange(gd_file, name, `ERDDAP Dataset ID`) %>% 
  datatable(
    escape=F,
    extensions = 'Buttons',
    options = list(
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
      pageLength = nrow(meta_files),
      lengthMenu = c(5, 20, 50, nrow(meta_files))))

TODO: search for unmatched elsewhere

Like here:

2021 Data - Google Drive

Write files_pi to Google Sheet

Write to files_pi tab in ERDDAP_CCIEA_database_2021.20210317 - Google Sheet

meta_files %>% 
  select(gd_file, name, url, `PI filename`, `ERDDAP Dataset ID`, `vars_erddap`) %>% 
  arrange(gd_file, name) %>% 
  mutate(
    gd_file = ifelse(
      !is.na(url),
      glue('=HYPERLINK("{url}", "{name}")'),
      NA) %>% gs4_formula) %>% 
  select(-url) %>% 
  write_sheet(gs_url, "files_PI")

files_unmatched

Files found in Ben’s ca / data but not matched with PI files (indicator name in PI file).

files_unmatched <- data_files %>%
  anti_join(meta_files, by="name")

files_unmatched %>% 
  select(gd_file) %>% 
  datatable(
    escape=F,
    extensions = 'Buttons',
    options = list(
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
      pageLength = nrow(files_unmatched),
      lengthMenu = c(5, 20, 50, nrow(files_unmatched))))

Write files_unmatched to Google Sheet

Write to files_unmatched tab in ERDDAP_CCIEA_database_2021.20210317 - Google Sheet

files_unmatched %>% 
  mutate(
    gd_file = glue('=HYPERLINK("{url}", "{name}")') %>% 
      gs4_formula) %>% 
  select(gd_file) %>% 
  write_sheet(gs_url, "files_unmatched")

Metadata

original: Documentation

datatable(doc)

original: Indicator Metadata

datatable(meta)

Normalize Metadata

normalized: providers

providers <- meta %>% 
  select(
    pi  = PI, 
    email = Contact) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(providers, "data/providers.csv")
datatable(providers)

normalized: institutions

institutions <- meta %>% 
  select(
    institution = Institution) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(institutions, "data/institutions.csv")
datatable(institutions)

normalized: components

components <- meta %>% 
  select(
    component    = `Component Section`,
    subcomponent = Subcomponent) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(components, "data/components.csv")
datatable(components)

normalized: datasets

datasets <- meta %>% 
  select(
    dataset_id  = `ERDDAP Dataset ID`) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(datasets, "data/datasets.csv")
datatable(datasets)

normalized: timeseries

timeseries <- meta %>% 
  select(
    timeseries_id = `CCIEA timeseries ID`,
    dataset_id    = `ERDDAP Dataset ID`,
    institution   = Institution) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(timeseries, "data/timeseries.csv")
datatable(timeseries)

normalized: region

regions <- meta %>% 
  select(
    region = region) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(regions, "data/regions.csv")
datatable(regions)

normalized: vars

vars <- meta %>% 
  select(
    var_id      = `Variable Name/ERDDAP`,
    var_label   = `Y-axis label (long variable name)`,
    var_units   = `Units`) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(vars, "data/vars.csv")
datatable(vars)

normalized: dataset_vars

dataset_vars <- meta %>% 
  select(
    dataset_id    = `ERDDAP Dataset ID`,
    var_id        = `Variable Name/ERDDAP`,
    fld_orig      = `indicator name in PI file`,
    pi            = PI,
    region, latitude, longitude,
    institution   = Institution,
    sampling_freq = `Sampling frequency`,
    sci_name      = `Scientific name`,
    component     = `Component Section`,
    subcomponent  = Subcomponent) %>% 
  mutate(
    across(where(is.list), as.character)) %>% 
  group_by_all() %>% 
  summarize(.groups = "drop")
write_csv(dataset_vars, "data/dataset_vars.csv")
datatable(dataset_vars)