# 00 - Build the Intake-ESM Catalog
This notebook builds an `intake-esm` catalog directly from the CESM history files. Throughout this analysis we read the history files as they are; we **do not** convert them from history to timeseries first.

In [1]:
from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_history
from config import analysis_config
import pandas as pd

In [2]:
# Show the history directories from the config that will be cataloged
analysis_config["case_data_paths"]

['/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.004/ocn/hist',
 '/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.002/ocn/hist',
 '/glade/scratch/hannay/archive/b1850.f19_g17.validation_nuopc.004_copy2/ocn/hist']

In [3]:
# Set up the catalog builder over the configured history directories.
# NOTE(review): the configured paths already end in .../ocn/hist, so the depth
# and exclude settings below may be broader than needed - confirm.
b = Builder(
    # Root directories to search
    analysis_config["case_data_paths"],
    # How many directory levels below each root to search
    depth=1,
    # Skip anything under timeseries or restart directories
    exclude_patterns=["*/tseries/*", "*/rest/*"],
    # Parallel jobs; -1 lets joblib use every available core
    njobs=-1,
)

In [5]:
# Build the catalog by running the CESM history parser over the files found
# under the configured directories. The trailing semicolon keeps the notebook
# from echoing the Builder repr, which only repeats the paths shown earlier.
b.build(parse_cesm_history);

<class 'list'>
[PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.004/ocn/hist'), PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.002/ocn/hist'), PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_nuopc.004_copy2/ocn/hist')]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 1378 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done 1720 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 2098 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 2512 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2962 tasks      | elapsed:  1.4min
[Paral

Builder(root_path=[PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.004/ocn/hist'), PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_mct.002/ocn/hist'), PosixPath('/glade/scratch/hannay/archive/b1850.f19_g17.validation_nuopc.004_copy2/ocn/hist')], extension='.nc', depth=1, exclude_patterns=['*/tseries/*', '*/rest/*'], njobs=-1)

In [6]:
# Attributes that intake-esm groups by when reading in variables
dataset_keys = ["component", "stream", "case"]

# Aggregation fed into xarray when the data are read: join the entries of a
# group on their "date" attribute along the time dimension
time_join = {
    "type": "join_existing",
    "attribute_name": "date",
    "options": {"dim": "time", "coords": "minimal", "compat": "override"},
}

# Write the catalog to the location given in the config
b.save(
    # Output path - a .csv suffix gives an uncompressed csv, .csv.gz a compressed one
    analysis_config["catalog_csv"],
    # Name of the column holding each file path
    path_column_name="path",
    # Name of the column holding the variables
    variable_column_name="variables",
    # Format of the data files - netcdf here, zarr is the alternative
    data_format="netcdf",
    groupby_attrs=dataset_keys,
    aggregations=[time_join],
)

Saved catalog location: ../data/cesm-validation-catalog.json and ../data/cesm-validation-catalog.csv


