Tuesday, July 2, 2019

The config.toml file of H2O Driverless AI 1.6.2


This is the config.toml file that appears in the top-level directory when you install H2O Driverless AI. It is hard to find on the internet, so I am posting it here for reference. It is a handy way to see which values can be tuned in H2O Driverless AI and what their defaults are.

root@9f7e555921e5:~/dai-1.6.2-linux-ppc64le# cat config.toml
##############################################################################
##                        DRIVERLESS AI CONFIGURATION FILE
#
# Comments:
# This file is authored in TOML (see https://github.com/toml-lang/toml)
#
# Config Override Chain
# Configuration variables for Driverless AI can be provided in several ways,
# the config engine reads and overrides variables in the following order
#
# 1. h2oai/config/config.toml
# [internal not visible to users]
#
# 2. config.toml
# [place file in a folder/mount file in docker container and provide path
# in "DRIVERLESS_AI_CONFIG_FILE" environment variable]
#
# 3. Environment variable
# [configuration variables can also be provided as environment variables
# they must have the prefix "DRIVERLESS_AI_" followed by
# variable name in caps, e.g. "authentication_method" can be provided as
# "DRIVERLESS_AI_AUTHENTICATION_METHOD"]

# Note: All floating point values < 1.0 need to start with 0.
# E.g. max_relative_cardinality = 0.95

##############################################################################
## Toml Control : Ways to control how toml parameters are set

# Whether to allow user to change non-server toml parameters per experiment in expert page
#allow_config_overrides_in_expert_page = true

# Instructions for 'Add to config.toml via toml string' in GUI expert page
# Self-referential toml parameter, for setting any other toml parameters as a string of toml assignments separated by \n (spaces around \n are ok).
# Useful when toml parameter is not in expert mode but want per-experiment control.
# Setting this will override all other choices.
# In expert page, each time expert options saved, the new state is set without memory of any prior settings.
# The entered item is a fully compliant toml string that would be processed directly by toml.load().
# One should include 2 double quotes around the entire setting, or double quotes need to be escaped.
# One enters into the expert page text as follows:
# e.g. enable_glm=\"off\" \n enable_xgboost=\"off\" \n enable_lightgbm=\"on\"
# e.g. ""enable_glm="off" \n enable_xgboost="off" \n enable_lightgbm="off" \n enable_tensorflow="on"""
# e.g. fixed_num_individuals=4
# e.g. params_lightgbm=\"{'objective':'poisson'}\"
# e.g. ""params_lightgbm="{'objective':'poisson'}"""
# e.g. max_cores=10 \n data_precision=\"float32\" \n max_rows_feature_evolution=50000000000 \n ensemble_accuracy_switch=11 \n feature_engineering_effort=1 \n target_transformer=\"identity\" \n tournament_feature_style_accuracy_switch=5 \n params_tensorflow=\"{'layers': [100, 100, 100, 100, 100, 100]}\"
# e.g. ""max_cores=10 \n data_precision="float32" \n max_rows_feature_evolution=50000000000 \n ensemble_accuracy_switch=11 \n feature_engineering_effort=1 \n target_transformer="identity" \n tournament_feature_style_accuracy_switch=5 \n params_tensorflow="{'layers': [100, 100, 100, 100, 100, 100]}"""
# If you see: "toml.TomlDecodeError" then ensure toml is set correctly.
# When set in the expert page of an experiment, these changes only affect experiments and not the server
# Usually should keep this as empty string in this toml file.
#config_overrides = ''

##############################################################################
## Setup : Configure application server here (ip, ports, authentication, file
# types etc)

# IP address and port of autoviz process.
#vis_server_ip = "127.0.0.1"
#vis_server_port = 12346

# IP address and port of procsy process.
#procsy_ip = "127.0.0.1"
#procsy_port = 12347

# IP address and port of H2O instance.
#h2o_ip = "127.0.0.1"
#h2o_port = 54321

# IP address and port for Driverless AI HTTP server.
#ip = "127.0.0.1"
#port = 12345

# File upload limit (default 100GB)
#max_file_upload_size = 104857600000
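# [Editor's note] Arithmetic check: 104857600000 = 100 * 1024^2 * 1000 bytes,
# i.e. 100 MiB x 1000, roughly 100 GB. To cap uploads at roughly 10 GB instead
# (illustrative value):
# max_file_upload_size = 10485760000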

# Verbosity of logging
# 0: quiet   (CRITICAL, ERROR, WARNING)
# 1: default (CRITICAL, ERROR, WARNING, INFO, DATA)
# 2: verbose (CRITICAL, ERROR, WARNING, INFO, DATA, DEBUG)
# Affects server and all experiments
#log_level = 1

# Whether to collect relevant server logs (h2oai_server.log, dai.log from systemctl or docker, and h2o log)
# Useful for when sending logs to H2O.ai
#collect_server_logs_in_experiment_logs = false

# Redis
#redis_ip = "127.0.0.1"
#redis_port = 6379
#master_redis_password = ""

# https settings
#
# You can make a self-signed certificate for testing with the following commands:
#
#     sudo openssl req -x509 -newkey rsa:4096 -keyout private_key.pem -out cert.pem -days 3650 -nodes -subj "/O=Driverless AI"
#     sudo chown dai:dai cert.pem private_key.pem
#     sudo chmod 600 cert.pem private_key.pem
#     sudo mv cert.pem private_key.pem /etc/dai
#
#enable_https = false
#ssl_key_file = "/etc/dai/private_key.pem"
#ssl_crt_file = "/etc/dai/cert.pem"
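#
# [Editor's note] With the certificate generated by the openssl commands above,
# enabling HTTPS amounts to flipping the default below (the key/cert paths
# already match those commands):
# enable_https = true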

# SSL TLS
#ssl_no_sslv2 = true
#ssl_no_sslv3 = true
#ssl_no_tlsv1 = true
#ssl_no_tlsv1_1 = true
#ssl_no_tlsv1_2 = false
#ssl_no_tlsv1_3 = false

# Data directory. All application data and files related to datasets and
# experiments are stored in this directory.

#data_directory = "./tmp"

# Whether to run quick performance benchmark at start of application
#enable_benchmark = true

# Whether to run quick startup checks at start of application
#enable_startup_checks = true

# Whether to opt in to usage statistics and bug reporting
#usage_stats_opt_in = false

# authentication_method
# unvalidated : Accepts user id and password. Does not validate password.
# none: Does not ask for user id or password. Authenticated as admin.
# openid: Uses OpenID Connect provider for authentication. See additional OpenID settings below.
# pam: Accepts user id and password. Validates user with operating system.
# ldap: Accepts user id and password. Validates against an ldap server. Look
# for additional settings under LDAP settings.
# local: Accepts a user id and password. Validated against an htpasswd file provided in local_htpasswd_file.
# ibm_spectrum_conductor: Authenticate with IBM conductor auth api.
#authentication_method = "unvalidated"
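#
# [Editor's note] An illustrative combination for password-file authentication
# (the htpasswd path is hypothetical; see "Local password file" below):
# authentication_method = "local"
# local_htpasswd_file = "/etc/dai/htpasswd"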

# default amount of time in hours before we force user to login again (if not provided by authentication_method)
#authentication_default_timeout_hours = 72

# OpenID Connect Settings:
# base server uri to the OpenID Provider server (ex: http://localhost:7777)
#auth_openid_provider_base_uri=""
# uri to pull OpenID config data from (you can extract most of required OpenID config from this url)
# usually located at: /auth/realms/master/.well-known/openid-configuration
#auth_openid_configuration_uri=""
# uri to start authentication flow
#auth_openid_auth_uri=""
# uri to make request for token after callback from OpenID server was received
#auth_openid_token_uri=""
# uri to get user information once access_token has been acquired (ex: list of groups user belongs to will be provided here)
#auth_openid_userinfo_uri=""
# uri to logout user
#auth_openid_logout_uri=""
# callback uri that the OpenID provider will use to send the "authentication_code"
#auth_openid_redirect_uri=""
# OAuth2 grant type (usually access_token)
#auth_openid_grant_type=""
# OAuth2 response type (usually code)
#auth_openid_response_type=""
# Client ID registered with OpenID provider
#auth_openid_client_id=""
# Client secret provided by OpenID provider when registering Client ID
#auth_openid_client_secret=""
# Scope of info (usually openid)
#auth_openid_scope=""
# What key in user_info json should we check to authorize user
#auth_openid_userinfo_auth_key=""
# What value should the key have in user_info json in order to authorize user
#auth_openid_userinfo_auth_value=""
# Key that specifies username in user_info json (we will use it as username in our system)
#auth_openid_userinfo_username_key=""
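#
# [Editor's note] An illustrative OpenID sketch; the host and client values
# below are made up and must come from your own provider:
# auth_openid_provider_base_uri = "http://keycloak.example.com:8080"
# auth_openid_configuration_uri = "/auth/realms/master/.well-known/openid-configuration"
# auth_openid_response_type = "code"
# auth_openid_client_id = "driverlessai"
# auth_openid_client_secret = "<client secret from the provider>"
# auth_openid_scope = "openid"
# auth_openid_userinfo_username_key = "preferred_username"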

# server_cookie_expiration_days
# Sets the time until expiration of secure cookie issued by server to client
# Cookie is issued upon login to Driverless AI UI and will expire 'n' days after that point
# If you wish cookies to expire in less than 1 day use decimals (1 day / 24 hours = 0.042 day/hour --> cookie expires in 1 hour)
# Expected behavior: if user is logged in, and cookie expires, the next click will redirect the user to the login page of the Driverless AI UI.
#server_cookie_expiration_days = 30

# LDAP Configuration
#ldap_server = ""    # ldap server domain or ip
#ldap_port = ""      # ldap server port
#ldap_bind_dn = ""  # Complete DN of the LDAP bind user
#ldap_bind_password = "" #Password for the LDAP bind
#ldap_tls_file = ""        # Provide Cert file location
#ldap_use_ssl = ""         # use true to use ssl or false
#ldap_search_base = ""     # the location in the DIT where the search will start
#ldap_search_filter = ""   # a string that describes what you are searching for
#ldap_search_attributes = "" # ldap attributes to return from search
#ldap_user_name_attribute ="uid" # specify key to find user name
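#
# [Editor's note] Illustrative values for a simple non-SSL LDAP setup; all DN
# components below are hypothetical:
# ldap_server = "ldap.example.com"
# ldap_port = "389"
# ldap_bind_dn = "cn=admin,dc=example,dc=com"
# ldap_bind_password = "<bind password>"
# ldap_search_base = "ou=people,dc=example,dc=com"
# ldap_search_filter = "(objectClass=person)"
# ldap_user_name_attribute = "uid"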

# LDAP deprecated settings
#ldap_recipe = "0"          # When using this recipe, needs to be set to "1"
#ldap_user_prefix = ""  # Deprecated, do not use
#ldap_search_user_id = ""  # Deprecated, use ldap_bind_dn
#ldap_search_password = "" # Deprecated, use ldap_bind_password
#ldap_ou_dn = ""           # Deprecated, use ldap_search_base instead
#ldap_dc = ""               # Deprecated, use ldap_base_dn
#ldap_base_dn = ""           # Deprecated, use ldap_search_base
#ldap_base_filter = ""       # Deprecated, use ldap_search_filter

# Local password file
# Generating a htpasswd file: see syntax below
# htpasswd -B "<location_to_place_htpasswd_file>" "<username>"
# note: -B forces use of bcrypt, a secure password hashing method
#local_htpasswd_file = ""
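#
# [Editor's note] For example (hypothetical path and user; -c creates the file):
#     htpasswd -B -c /etc/dai/htpasswd jsmith
# local_htpasswd_file = "/etc/dai/htpasswd"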

# Supported file formats (file name endings must match for files to show up in file browser)
#supported_file_types = "csv, tsv, txt, dat, tgz, gz, bz2, zip, xz, xls, xlsx, nff, jay, feather, bin, arff, parquet"

# File System Support
# upload : standard upload feature
# file : local file system/server file system
# hdfs : Hadoop file system, remember to configure the HDFS config folder path and keytab below
# dtap : Blue Data Tap file system, remember to configure the DTap section below
# s3 : Amazon S3, optionally configure secret and access key below
# gcs : Google Cloud Storage, remember to configure gcs_path_to_service_account_json below
# gbq : Google Big Query, remember to configure gcs_path_to_service_account_json below
# minio : Minio Cloud Storage, remember to configure secret and access key below
# snow : Snowflake Data Warehouse, remember to configure Snowflake credentials below (account name, username, password)
# kdb : KDB+ Time Series Database, remember to configure KDB credentials below (hostname and port, optionally: username, password, classpath, and jvm_args)
# azrbs : Azure Blob Storage, remember to configure Azure credentials below (account name, account key)
#enabled_file_systems = "upload, file, hdfs, s3"
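#
# [Editor's note] Illustrative: to additionally expose Google Cloud Storage and
# Snowflake, extend the list (their credentials must be configured as noted above):
# enabled_file_systems = "upload, file, hdfs, s3, gcs, snow"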

# do_not_log_list : add configurations that you do not wish to be recorded in logs here

#do_not_log_list = "local_htpasswd_file, aws_access_key_id, aws_secret_access_key, snowflake_password, snowflake_url, snowflake_user, snowflake_account, minio_endpoint_url, minio_access_key_id, minio_secret_access_key, kdb_user, kdb_password, ldap_bind_password, gcs_path_to_service_account_json, azure_blob_account_name, azure_blob_account_key, deployment_aws_access_key_id, deployment_aws_secret_access_key, master_minio_access_key_id, master_minio_secret_access_key, master_redis_password, auth_openid_client_id, auth_openid_client_secret, auth_openid_userinfo_auth_key, auth_openid_userinfo_auth_value, auth_openid_userinfo_username_key"

# Minio is used for file distribution on multinode architecture
# These settings are used to specify the local Minio connection to master nodes
#master_minio_address = "<URL>:<PORT>"
#master_minio_access_key_id = ""
#master_minio_secret_access_key = ""

#allow_localstorage = true

##############################################################################
## Scoring Artifacts: Setup which scoring artifacts to generate by default

# Whether to create the Python scoring pipeline at the end of each experiment
#make_python_scoring_pipeline = true

# Whether to create the MOJO scoring pipeline at the end of each experiment
# Note: Not all transformers or main models are available for MOJO (e.g. no gblinear main model)
#make_mojo_scoring_pipeline = false


##############################################################################
## Hardware: Configure hardware settings here (GPUs, CPUs, Memory, etc.)

# Max number of CPU cores to use per experiment. Set to <= 0 to use all cores.
# One can also set environment variable "OMP_NUM_THREADS" to number of cores to use for OpenMP
# e.g. In bash: export OMP_NUM_THREADS=32 and export OPENBLAS_NUM_THREADS=32
# Set to -1 for all available cores.
#max_cores = -1

# Whether to set automatic number of cores by physical (true) or logical (false) count
# Using all logical cores can lead to poor performance due to cache thrashing
#max_cores_by_physical = true

# Absolute limit to core count
#max_cores_limit = 100

# Number of GPUs to use per experiment for training task.  Set to -1 for all GPUs.
# An experiment will generate many different models.
# Currently num_gpus_per_experiment!=-1 disables GPU locking, so is only recommended for
# single experiments and single users.
# Ignored if GPUs disabled or no GPUs on system.
# More info at: https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation
#num_gpus_per_experiment = -1

# Number of GPUs to use per model training task.  Set to -1 for all GPUs.
# For example, when this is set to -1 and there are 4 GPUs available, all of them can be used for the training of a single model.
# Currently num_gpus_per_model!=1 disables GPU locking, so is only recommended for single
# experiments and single users.
# Ignored if GPUs disabled or no GPUs on system.
# More info at: https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation
#num_gpus_per_model = 1

# Minimum number of threads for datatable (and OpenMP) during data munging
# datatable is the main data munging tool used within Driverless AI (source:
# https://github.com/h2oai/datatable)
#min_dt_threads_munging = 4

# Like min_dt_threads_munging (and OpenMP) but for final pipeline munging
#min_dt_threads_final_munging = 4

# Which gpu_id to start with
# If using CUDA_VISIBLE_DEVICES=... to control GPUs (preferred method), gpu_id=0 is the
# first in that restricted list of devices.
# E.g. if CUDA_VISIBLE_DEVICES="4,5" then gpu_id_start=0 will refer to the
# device #4.
# E.g. from expert mode, to run 2 experiments, each on a distinct GPU out of 2 GPUs:
# Experiment#1: num_gpus_per_model=1, num_gpus_per_experiment=1, gpu_id_start=0
# Experiment#2: num_gpus_per_model=1, num_gpus_per_experiment=1, gpu_id_start=1
# E.g. from expert mode, to run 2 experiments, each on a distinct GPU out of 8 GPUs:
# Experiment#1: num_gpus_per_model=1, num_gpus_per_experiment=4, gpu_id_start=0
# Experiment#2: num_gpus_per_model=1, num_gpus_per_experiment=4, gpu_id_start=4
# E.g. Like just above, but now run on all 4 GPUs/model
# Experiment#1: num_gpus_per_model=4, num_gpus_per_experiment=4, gpu_id_start=0
# Experiment#2: num_gpus_per_model=4, num_gpus_per_experiment=4, gpu_id_start=4
# If num_gpus_per_model!=1, global GPU locking is disabled
# (because underlying algorithms don't support arbitrary gpu ids, only sequential ids),
# so must setup above correctly to avoid overlap across all experiments by all users
# More info at: https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation
# Note that gpu selection does not wrap, so gpu_id_start + num_gpus_per_model must be less than number of visible gpus
#gpu_id_start = 0

# Maximum number of workers for DriverlessAI server pool (only 1 needed
# currently)
#max_workers = 1

# Period (in seconds) of ping by DriverlessAI server to each experiment
# (in order to get logger info like disk space and memory usage)
# 0 means don't print anything
#ping_period = 60

# Minimum amount of disk space in GB needed to run experiments.
# Experiments will fail if this limit is crossed.
# This limit exists, because Driverless AI needs to generate data for model training,
# feature engineering, documentation and other such processes.
# Disk space can also be used as an alternative to system memory (RAM).
#disk_limit_gb = 5

# Minimum amount of system memory in GB needed to start experiments
# As with disk space, a certain amount of system memory is needed to run some basic
# operations.
#memory_limit_gb = 5

# Minimum number of rows needed to run experiments (values lower than 100
# might not work)
# A minimum threshold is set to ensure there is enough data to create a statistically
# reliable model and avoid other small-data related failures.
#min_num_rows = 100

# Minimum required number of rows (in the training data) for each class label for
# classification problems.
#min_rows_per_class = 5

# Minimum required number of rows for each split when generating validation samples.
#min_rows_per_split = 5

# Precision of how data is stored
# "float32" best for speed, "float64" best for accuracy or very large input values
# "float32" allows numbers up to about +-3E38 with relative error of about 1E-7
# "float64" allows numbers up to about +-1E308 with relative error of about 1E-16
# Some calculations, like the GLM standardization, can only handle up to the square root of these maximums for data values,
# so GLM with 32-bit precision can only handle values up to about 1E19 before standardization generates inf values.
# If you see "Best individual has invalid score" you may require higher precision.
#data_precision = "float32"

# Precision of most data transformers
# (Same options and notes as data_precision)
# Useful for higher precision in transformers with numerous operations that can accumulate error
# Also useful if want faster performance for transformers but otherwise want data stored in high precision
#transformer_precision = "float32"

# Whether to change ulimit soft limits up to hard limits (for DAI server app, which is not a generic user app)
# Prevents resource limit problems in some cases
# Restricted to no more than limit_nofile and limit_nproc for those resources
#ulimit_up_to_hard_limit = true

# number of file limit
# Below should be consistent with start-dai.sh
#limit_nofile=65535

# number of threads limit
# Below should be consistent with start-dai.sh
#limit_nproc=16384

##############################################################################
## Machine Learning: Configure machine learning configurations here
# (Data, Feature Engineering, Modelling etc)

# Seed for random number generator to make experiments reproducible (on same hardware), only active if 'reproducible' mode is enabled
#seed = 1234

# List of values that should be interpreted as missing values during data import. Applies both to numeric and string columns. Note that 'nan' is always interpreted as a missing value for numeric columns.
#missing_values = "['', '?', 'None', 'nan', 'NA', 'N/A', 'unknown', 'inf', '-inf', '1.7976931348623157e+308', '-1.7976931348623157e+308']"
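#
# [Editor's note] Illustrative: to also treat a sentinel such as "-999" as
# missing, append it to the default list:
# missing_values = "['', '?', 'None', 'nan', 'NA', 'N/A', 'unknown', 'inf', '-inf', '1.7976931348623157e+308', '-1.7976931348623157e+308', '-999']"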

# For TensorFlow, what numerical value to give to missing values, where numeric values are standardized
# So 0 is center of distribution, and if Normal distribution then +-5 is 5 standard deviations away from the center.
# In many cases, an out of bounds value is a good way to represent missings, but in some cases the mean (0) may be better.
#tf_nan_impute_value = -5

# Internal threshold for number of rows x number of columns to trigger certain statistical
# techniques (small data recipe like including one hot encoding for all model types, and smaller learning rate)
# to increase model accuracy
#statistical_threshold_data_size_small = 100000

# Internal threshold for number of rows x number of columns to trigger certain statistical
# techniques (fewer genes created, removal of high max_depth for tree models, etc.) that can speed up modeling
# Also controls maximum rows used in training final model,
# by sampling (statistical_threshold_data_size_large / number of columns) rows
#statistical_threshold_data_size_large = 1000000000

# Internal threshold for number of rows x number of columns to trigger sampling for auxiliary data uses,
# like imbalanced data set detection and bootstrap scoring sample size and iterations
#aux_threshold_data_size_large = 10000000

# Internal threshold for number of rows x number of columns to trigger certain changes in performance
# (fewer threads if beyond large value) to help avoid OOM or unnecessary slowdowns
# (fewer threads if lower than small value) to avoid excess forking of tasks
#performance_threshold_data_size_small = 100000
#performance_threshold_data_size_large = 100000000

# Upper limit on the number of rows x number of columns for feature evolution (applies to both training and validation/holdout splits)
# feature evolution is the process that determines which features will be derived
# Depending on accuracy settings, a fraction of this value will be used
#feature_evolution_data_size = 100000000

# Maximum number of columns to start an experiment. This threshold exists to constrain the complexity and the length of Driverless AI's processes.
#max_cols = 10000

# Largest number of rows to use for column stats, otherwise sample randomly
#max_rows_col_stats = 1000000

# Maximum number of columns selected out of the original set of columns, using feature selection
# The selection is based upon how well target encoding (or frequency encoding if not available) performs on categoricals and on numerics treated as categoricals
# This is useful to reduce the final model complexity. First the best
# [max_orig_cols_selected] are found through feature selection methods and then
# these features are used in feature evolution (to derive other features) and in modelling.
#max_orig_cols_selected = 10000

# Maximum number of numeric columns selected, above which will do feature selection
# same as above (max_orig_cols_selected) but for numeric columns.
#max_orig_numeric_cols_selected = 10000

# Maximum number of non-numeric columns selected, above which will do feature selection on all features and avoid num_as_cat
# same as above (max_orig_numeric_cols_selected) but for categorical columns.
#max_orig_nonnumeric_cols_selected = 500

# The factor (times max_orig_cols_selected) above which column selection is done without target encoding and without num_as_cat,
# in order to limit the performance cost of feature engineering
#max_orig_cols_selected_simple_factor = 2

# Maximum allowed fraction of unique values for integer and categorical columns (otherwise will treat column as ID and drop)
#max_relative_cardinality = 0.95

# Maximum allowed number of unique values for integer and categorical columns (otherwise will treat column as ID and drop)
#max_absolute_cardinality = 1000000

# Whether to treat some numerical features as categorical
# For instance, sometimes an integer column may not represent a numerical feature but
# represent different numerical codes instead.
#num_as_cat = true

# Max number of unique values for integer/real columns to be treated as categoricals (test applies to first statistical_threshold_data_size_small rows only)
#max_int_as_cat_uniques = 50

# Number of folds for models used during the feature engineering process
# Increasing this will put a lower fraction of data into validation and more into training
# E.g. num_folds=3 means 67%/33% training/validation splits
# Actual value will vary for small or big data cases
#num_folds = 3

# Accuracy setting equal and above which enables full cross-validation (multiple folds) during feature evolution
# as opposed to only a single holdout split (e.g. 2/3 train and 1/3 validation holdout)
#full_cv_accuracy_switch = 8

# Accuracy setting equal and above which enables stacked ensemble as final model
# Stacking commences at the end of the feature evolution process.
# It quite often leads to better model performance, but it does increase the complexity
# and execution time of the final model.
#ensemble_accuracy_switch = 5

# Fixed ensemble_level
# -1 = auto, based upon ensemble_accuracy_switch, accuracy, size of data, etc.
# 0 = No ensemble, only final single model on validated iteration/tree count
# 1 = 1 model, multiple ensemble folds (cross-validation)
# 2 = 2 models, multiple ensemble folds (cross-validation)
# 3 = 3 models, multiple ensemble folds (cross-validation)
# 4 = 4 models, multiple ensemble folds (cross-validation)
#fixed_ensemble_level = -1

# Number of fold splits to use for ensemble_level >= 2
# The ensemble modelling may require predictions to be made on out-of-fold samples
# hence the data needs to be split on different folds to generate these predictions.
# Fewer folds (like 2 or 3) normally create more stable models, but they may be less accurate
# More folds can get to higher accuracy at the expense of more time, but the performance
# may be less stable when there is not enough training data (i.e. higher chance of overfitting).
# Actual value will vary for small or big data cases
#num_ensemble_folds = 5

# Number of repeats for each fold for all validation
# (modified slightly for small or big data cases)
#fold_reps = 1

# For binary classification: ratio of majority to minority class equal and above which to enable undersampling
# This option helps to deal with imbalance (on the target variable)
#imbalance_ratio_undersampling_threshold = 5

# Quantile-based sampling method for imbalanced binary classification (only if class ratio is above the threshold provided above)
# Model on data is used to create deciles of predictions, and then each decile is sampled from uniformly.
#quantile_imbalanced_sampling = false

# Maximum number of classes to allow for a multi-classification problem.
# High number of classes may make certain processes of Driverless AI time-consuming.
# Memory requirements also increase with higher number of classes
#max_num_classes = 100

# Number of actuals vs. predicted data points to use in order to generate the relevant
# plot/graph, which is shown at the right part of the screen within an experiment.
#num_actuals_vs_predicted = 100

# Whether to use H2O.ai brain, the local caching and smart re-use of prior models to generate features for new models
# This variable essentially controls how much information we store about the different
# models generated and different features explored while running an experiment. It can help
# with checkpointing and retrieving experiments that have been paused or interrupted.
#  Will use H2O.ai brain cache if cache file has no extra column names per column type,
#  cache exactly matches classes, class labels, and time series options,
#  interpretability of cache is equal or lower,
#  main model (booster) is allowed by new experiment
# Level of brain to use (for chosen level, where higher levels will also do all lower level operations automatically)
# -1 = Don't use any brain cache and don't write any cache
#  0 = Don't use any brain cache but still write cache
#      Use case: Want to save model for later use, but want current model to be built without any brain models
#  1 = smart checkpoint if passed in old experiment_id to pull from (via GUI, running "restart from checkpoint" or choosing which experiment to resume from)
#      Use case: From GUI select prior experiment using the right-hand panel, and select "RESTART FROM LAST CHECKPOINT" to use specific experiment's model to build new models from
#  2 = smart checkpoint from H2O.ai brain cache of individual best models
#      Use case: No need to select a particular prior experiment, we scan through H2O.ai brain cache for best models to restart from
#  3 = smart checkpoint like level #1, but for entire population.  Tune only if brain population insufficient size
#      (will re-score entire population in single iteration, so appears to take longer to complete first iteration)
#  4 = smart checkpoint like level #2, but for entire population.  Tune only if brain population insufficient size
#      (will re-score entire population in single iteration, so appears to take longer to complete first iteration)
#  5 = like #4, but will scan over entire brain cache of populations to get best scored individuals, starting from resumed experiment if chosen.
#      (can be slower due to brain cache scanning if big cache)
# Other use cases:
# a) Restart on different data: Use same column names and fewer or more rows (applicable to 1 - 5)
# b) Re-fit only final pipeline: Like (a), but choose time=1 and feature_brain_level=3 - 5
# c) Restart with more columns: Add columns, so model builds upon old model built from old column names (1 - 5)
# d) Restart with focus on model tuning: Restart, then select feature_engineering_effort = 3 in expert settings
# Notes:
# 1) For Restart cases, may want to set min_dai_iterations to non-zero to force delayed early stopping, else may not be enough iterations to find better model.
# 2) A "Restart from last checkpoint" of a Re-fit will fail to find cache and re-start fresh experiment
# 3) A "New model with Same Params" of a Re-fit will fail to find cache and re-start fresh experiment
# 4) A "New model with Same Params" of a Restart will use feature_brain_level=3 for default Restart mode (revert to 2, or even 0 if want to start a fresh experiment otherwise)
#feature_brain_level = 2
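#
# [Editor's note] A minimal illustrative override for use case (b) above
# (re-fit only the final pipeline from a prior experiment); a sketch, not a
# recommendation:
# feature_brain_level = 3
# For Restart cases, note 1) above suggests a non-zero min_dai_iterations, e.g.:
# min_dai_iterations = 1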

# Maximum number of brain individuals pulled from H2O.ai brain cache for feature_brain_level=1, 2
#max_num_brain_indivs = 3

# Directory, relative to data_directory, to store H2O.ai brain meta model files
#brain_rel_dir = "H2O.ai_brain"

# Maximum size in GB the brain will store
# We reserve this memory to save data in order to ensure we can retrieve an experiment if
# for any reason it gets interrupted.
# -1: unlimited
# >=0 number of GB to limit brain to
#brain_max_size_GB = 20

# Whether to enable early stopping
# Early stopping refers to stopping the feature evolution/engineering process
# when there is no performance uplift after a certain number of iterations.
# After early stopping has been triggered, Driverless AI will initiate the ensemble
# process if selected.
#early_stopping = true

# Minimum number of Driverless AI iterations to stop the feature evolution/engineering
# process even if score is not improving. Driverless AI needs to run for at least that many
# iterations before deciding to stop. It can be seen as a safeguard against suboptimal (early)
# convergence.
#min_dai_iterations = 0

# Maximum features per model (and each model within the final model if ensemble) kept just after scoring them
# Keeps top variable importance features, prunes rest away, after each scoring.
# Final ensemble will exclude any pruned-away features and only train on kept features,
#   but may contain a few new features due to fitting on different data view (e.g. new clusters)
# Final scoring pipeline will exclude any pruned-away features,
#   but may contain a few new features due to fitting on different data view (e.g. new clusters)
# -1 means no restrictions except internally-determined memory restrictions
#nfeatures_max = -1

# Recipe type
# Recipes override any GUI settings
# 'auto' : all models and features automatically determined by experiment settings, toml settings, and feature_engineering_effort
# 'compliant' : like 'auto' except:
#
# interpretability=10 (to avoid complexity; overrides whatever interpretability the GUI or python client chose)
# enable_glm='on' (rest 'off', to avoid complexity and be compatible with algorithms supported by MLI)
# num_as_cat=false: don't convert any numerics to categoricals except via one-hot encoding (to avoid complexity)
# fixed_ensemble_level=0: Don't use any ensemble (to avoid complexity)
# feature_brain_level=0: No feature brain used (to ensure every restart is identical)
# max_feature_interaction_depth=1: interaction depth is set to 1 (no multi-feature interactions to avoid complexity)
# target_transformer='identity': for regression (to avoid complexity)
# check_distribution_shift=false: Don't use distribution shift between train, valid, and test to drop features (bit risky without fine-tuning)
#recipe = 'auto'

# How much effort to spend on feature engineering (0...10)
# Heuristic combination of various developer-level toml parameters
# 0   : keep only numeric features, only model tuning during evolution
# 1   : keep only numeric features and frequency-encoded categoricals, only model tuning during evolution
# 2   : Like #1 but instead just no Text features.  Some feature tuning before evolution.
# 3   : Like #5 but only tuning during evolution.  Mixed tuning of features and model parameters.
# 4   : Like #5, but slightly more focused on model tuning
# 5   : Default.  Balanced feature-model tuning
# 6-7 : Like #5, but slightly more focused on feature engineering
# 8   : Like #6-7, but even more focused on feature engineering with high feature generation rate, no feature dropping even if high interpretability
# 9-10: Like #8, but no model tuning during feature evolution
#feature_engineering_effort = 5

# Threshold for average string-is-text score as determined by internal heuristics
# It decides when a string column will be treated as text (for an NLP problem) or just as
# a standard categorical variable.
# Higher values will favor string columns as categoricals, lower values will favor string columns as text
#string_col_as_text_threshold = 0.3

# Minimum fraction of unique values for string columns to be considered as possible text (otherwise categorical)
#string_col_as_text_min_relative_cardinality = 0.1

# Minimum number of uniques for string columns to be considered as possible text (otherwise categorical)
#string_col_as_text_min_absolute_cardinality = 100

# Interpretability setting equal and above which will use monotonicity constraints in GBM
# You may read the following source to understand what these constraints connote and why
# they may be important, especially when the end goal is a very interpretable machine
# learning model: https://blog.datadive.net/monotonicity-constraints-in-machine-learning/
#monotonicity_constraints_interpretability_switch = 7

# Exploring feature interactions can be important in gaining better predictive performance.
# The interaction can take multiple forms (i.e. feature1 + feature2 or feature1 * feature2 + ... featureN)
# Although certain machine learning algorithms (like tree-based methods) can do well in
# capturing these interactions as part of their training process, still generating them may
# help them (or other algorithms) yield better performance.
# The depth of the interaction level (as in "up to" how many features may be combined at
# once to create one single feature) can be specified to control the complexity of the
# feature engineering process. Higher values might be able to make more predictive models
# at the expense of time.
#max_feature_interaction_depth = 8

# Accuracy setting equal and above which enables tuning of model parameters
# Only applicable if parameter_tuning_num_models=-1 (auto)
#tune_parameters_accuracy_switch = 3

# Number of models to tune during pre-evolution phase
# Can make this lower to avoid excessive tuning, or make higher to do enhanced tuning
# -1 : auto
#parameter_tuning_num_models = -1

# Accuracy setting equal and above which enables tuning of target transform for regression
# This is useful for time series when instead of predicting the actual target value, it
# might be better to predict a transformed target variable like sqrt(target) or log(target)
# as a means to control for outliers.
#tune_target_transform_accuracy_switch = 3

# Whether to automatically select target transformation for regression problems
# Can choose: 'identity' to disable any transformation, otherwise use 'auto'
#target_transformer = 'auto'

# Tournament style (method to decide which models are best at each iteration)
# "auto" : Choose based upon accuracy, etc.
# "fullstack" : Choose among optimal model and feature types
# "uniform" : all individuals in population compete to win as best
# "model" : individuals with same model type compete
# "feature" : individuals with similar feature types compete
# "model" and "feature" styles preserve at least one winner for each type (and so 2 total indivs of each type after mutation)
# For each case, a round robin approach is used to choose best scores among type of models to choose from
#tournament_style = "auto"

# Interpretability above which will use "uniform" tournament style
#tournament_uniform_style_interpretability_switch = 6

# Accuracy below which will use uniform style if tournament_style = "auto" (regardless of other accuracy tournament style switch values)
#tournament_uniform_style_accuracy_switch = 6

# Accuracy equal and above which uses model style if tournament_style = "auto"
#tournament_model_style_accuracy_switch = 6

# Accuracy equal and above which uses feature style if tournament_style = "auto"
#tournament_feature_style_accuracy_switch = 7

# Accuracy equal and above which uses fullstack style if tournament_style = "auto"
#tournament_fullstack_style_accuracy_switch = 8

# Driverless AI uses a genetic algorithm (GA) to find the best features, best models and
# best hyper parameters for these models. The GA facilitates getting good results while not
# requiring to run/try every possible model/feature/parameter. This version of GA has
# reinforcement learning elements - it uses a form of exploration-exploitation to reach
# optimum solutions. This means it will capitalise on models/features/parameters that seem
# to be working well and continue to exploit them even more, while allowing some room for
# trying new (and semi-random) models/features/parameters to avoid settling on a local
# minimum.
# These models/features/parameters tried are what we call individuals of a population. More
# individuals connote more models/features/parameters to be tried and compete to find the
# best ones.
#num_individuals = 2

# set fixed number of individuals (if > 0) - useful to compare different hardware configurations
#fixed_num_individuals = 0

# set fixed number of folds (if > 0) when using cross-validation. It may be useful for
# quick runs regardless of the data size
#fixed_num_folds = 0

# set fixed number of fold reps (if > 0) - useful for quick runs regardless of data
#fixed_fold_reps = 0

# set true to force only first fold for models - useful for quick runs regardless of data
#fixed_only_first_fold_model = false

# number of unique targets or folds counts after which switch to faster/simpler non-natural sorting and print outs
#sanitize_natural_sort_limit = 1000

# Whether target encoding is generally enabled
# Target encoding refers to several different feature transformations (primarily focused on
# categorical data) that aim to represent the feature using information of the actual
# target variable. A simple example can be to use the mean of the target to replace each
# unique category of a categorical feature. These features can be very predictive,
# but they are prone to overfitting and require more memory, as they need to store mappings of
# the unique categories and the target values.
#enable_target_encoding = true

# Driverless AI categorises all data (feature engineering) transformers
# More information for these transformers can be viewed here:
# http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/transformations.html
# This section allows excluding/blocking these transformations (independent of
# the interpretability setting) and may be useful when simpler (more
# interpretable) models are sought at the expense of accuracy.
# for multi-class: "['NumCatTETransformer', 'TextLinModelTransformer',
# 'FrequentTransformer', 'CVTargetEncodeF', 'ClusterDistTransformer',
# 'WeightOfEvidenceTransformer', 'TruncSVDNumTransformer', 'CVCatNumEncodeF',
# 'DatesTransformer', 'TextTransformer', 'FilterTransformer',
# 'NumToCatWoETransformer', 'NumToCatTETransformer', 'ClusterTETransformer',
# 'BulkInteractionsTransformer']"
#
# for regression/binary: "['TextTransformer', 'ClusterDistTransformer',
# 'FilterTransformer', 'TextLinModelTransformer', 'NumToCatTETransformer',
# 'DatesTransformer', 'WeightOfEvidenceTransformer', 'BulkInteractionsTransformer',
# 'FrequentTransformer', 'CVTargetEncodeF', 'NumCatTETransformer',
# 'NumToCatWoETransformer', 'TruncSVDNumTransformer', 'ClusterTETransformer',
# 'CVCatNumEncodeF']"
#
# This list appears in the experiment logs (search for "Transformers used")
# e.g. to disable all Target Encoding: exclude_transformers =
# "['NumCatTETransformer', 'CVTargetEncodeF', 'NumToCatTETransformer',
# 'ClusterTETransformer']"
#exclude_transformers = ""

# Exclude list of genes (i.e. genes (built on top of transformers) to not use,
# independent of the interpretability setting)
# Some transformers are used by multiple genes, so this allows different control over feature engineering
#
# for multi-class: "['BulkInteractionsGene', 'WeightOfEvidenceGene',
# 'NumToCatTargetEncodeSingleGene', 'FilterGene', 'TextGene', 'FrequentGene',
# 'NumToCatWeightOfEvidenceGene', 'NumToCatWeightOfEvidenceMonotonicGene', '
# CvTargetEncodeSingleGene', 'DateGene', 'NumToCatTargetEncodeMultiGene', '
# DateTimeGene', 'TextLinRegressorGene', 'ClusterIDTargetEncodeSingleGene',
# 'CvCatNumEncodeGene', 'TruncSvdNumGene', 'ClusterIDTargetEncodeMultiGene',
# 'NumCatTargetEncodeMultiGene', 'CvTargetEncodeMultiGene', 'TextLinClassifierGene',
# 'NumCatTargetEncodeSingleGene', 'ClusterDistGene']"
#
# for regression/binary: "['CvTargetEncodeSingleGene', 'NumToCatTargetEncodeSingleGene',
# 'CvCatNumEncodeGene', 'ClusterIDTargetEncodeSingleGene', 'TextLinRegressorGene',
# 'CvTargetEncodeMultiGene', 'ClusterDistGene', 'FilterGene', 'DateGene',
# 'ClusterIDTargetEncodeMultiGene', 'NumToCatTargetEncodeMultiGene',
# 'NumCatTargetEncodeMultiGene', 'TextLinClassifierGene', 'WeightOfEvidenceGene',
# 'FrequentGene', 'TruncSvdNumGene', 'BulkInteractionsGene', 'TextGene',
# 'DateTimeGene', 'NumToCatWeightOfEvidenceGene',
# 'NumToCatWeightOfEvidenceMonotonicGene', 'NumCatTargetEncodeSingleGene']"
#
# This list appears in the experiment logs (search for "Genes used")
# e.g. to disable bulk interaction gene, use:  exclude_genes =
#"['BulkInteractionsGene']"
#exclude_genes = ""

# Parameters for LightGBM to override DAI parameters
# parameters should be given as XGBoost equivalent unless unique LightGBM parameter
# e.g. 'eval_metric' instead of 'metric' should be used
# e.g. params_lightgbm = "{'objective': 'binary:logistic', 'n_estimators': 100, 'max_leaves': 64, 'random_state': 1234}"
# e.g. params_lightgbm = "{'n_estimators': 600, 'learning_rate': 0.1, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'gamma': 0, 'max_depth': 0, 'max_bin': 128, 'max_leaves': 256, 'scale_pos_weight': 1.0, 'max_delta_step': 3.469919910597877, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.3, 'tree_method': 'gpu_hist', 'grow_policy': 'lossguide', 'min_data_in_bin': 3, 'min_child_samples': 5, 'early_stopping_rounds': 20, 'num_classes': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'random_state': 987654, 'early_stopping_threshold': 0.01, 'monotonicity_constraints': False, 'silent': True, 'debug_verbose': 0, 'subsample_freq': 1}"
# avoid including "system"-level parameters like 'n_gpus': 1, 'gpu_id': 0, 'n_jobs': 1, 'booster': 'lightgbm'
# also likely should avoid parameters like: 'objective': 'binary:logistic', unless one really knows what one is doing (e.g. alternative objectives)
# See: https://xgboost.readthedocs.io/en/latest/parameter.html
# And see: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
# Can also pass objective parameters if one chooses (or in case DAI automatically chooses) certain objectives
# https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters
#params_lightgbm = "{}"

# Parameters for XGBoost to override DAI parameters
# similar parameters as lightgbm since lightgbm parameters are transcribed from xgboost equivalent versions
# e.g. params_xgboost = "{'n_estimators': 100, 'max_leaves': 64, 'max_depth': 0, 'random_state': 1234}"
# See: https://xgboost.readthedocs.io/en/latest/parameter.html
#params_xgboost = "{}"

# Like params_xgboost but for XGBoost's dart method
#params_dart = "{}"

# Parameters for Tensorflow to override DAI parameters
# e.g. params_tensorflow = "{'lr': 0.01, 'add_wide': False, 'add_attention': True, 'epochs': 30, 'layers': [100, 100], 'activation': 'selu', 'batch_size': 64, 'chunk_size': 1000, 'dropout': 0.3, 'strategy': 'one_shot', 'l1': 0.0, 'l2': 0.0, 'ort_loss': 0.5, 'ort_loss_tau': 0.01, 'normalize_type': 'streaming'}"
# See: https://keras.io/ , e.g. for activations: https://keras.io/activations/
# Example layers: [500, 500, 500], [100, 100, 100], [100, 100], [50, 50]
# Strategies: '1cycle' or 'one_shot', See: https://github.com/fastai/fastai
# normalize_type: 'streaming' or 'global' (using sklearn StandardScaler)
#params_tensorflow = "{}"

# Parameters for XGBoost's gblinear to override DAI parameters
# e.g. params_gblinear = "{'n_estimators': 100}"
# See: https://xgboost.readthedocs.io/en/latest/parameter.html
#params_gblinear = "{}"

# Parameters for Rulefit to override DAI parameters
# e.g. params_rulefit = "{'max_leaves': 64}"
# See: https://xgboost.readthedocs.io/en/latest/parameter.html
#params_rulefit = "{}"

# Parameters for FTRL to override DAI parameters
#params_ftrl = "{}"

# Dictionary of key:lists of values to use for LightGBM tuning, overrides DAI's choice per key
# e.g. params_tune_lightgbm = "{'min_child_samples': [1,2,5,100,1000], 'min_data_in_bin': [1,2,3,10,100,1000]}"
#params_tune_lightgbm = "{}"

# Like params_tune_lightgbm but for XGBoost
# e.g. params_tune_xgboost = "{'max_leaves': [8, 16, 32, 64]}"
#params_tune_xgboost = "{}"

# Like params_tune_lightgbm but for XGBoost's Dart
# e.g. params_tune_dart = "{'max_leaves': [8, 16, 32, 64]}"
#params_tune_dart = "{}"

# Like params_tune_lightgbm but for TensorFlow
# e.g. params_tune_tensorflow = "{'layers': [[10,10,10], [10, 10, 10, 10]]}"
#params_tune_tensorflow = "{}"

# Like params_tune_lightgbm but for gblinear
# e.g. params_tune_gblinear = "{'reg_lambda': [.01, .001, .0001, .0002]}"
#params_tune_gblinear = "{}"

# Like params_tune_lightgbm but for rulefit
# e.g. params_tune_rulefit = "{'max_depth': [4, 5, 6]}"
#params_tune_rulefit = "{}"

# Like params_tune_lightgbm but for ftrl
#params_tune_ftrl = "{}"

# Whether to force max_leaves and max_depth to be 0 if grow_policy is depthwise and lossguide, respectively.
#params_tune_grow_policy_simple_trees = true

# Whether to enable XGBoost models (auto/on/off)
#enable_xgboost = "auto"

# Internal threshold for number of rows x number of columns to trigger no xgboost models due to high memory use
# Overridden if enable_xgboost = "on", in which case always allow xgboost to be used
#xgboost_threshold_data_size_large = 100000000

# Internal threshold for number of rows x number of columns to trigger no xgboost models due to limits on GPU memory capability
# Overridden if enable_xgboost = "on", in which case always allow xgboost to be used
#xgboost_gpu_threshold_data_size_large = 30000000

# Whether to enable GLM models (auto/on/off)
#enable_glm = "auto"

# Whether to enable LightGBM models (auto/on/off)
#enable_lightgbm = "auto"

# Whether to enable Random Forest (in LightGBM package) models (auto/on/off/only)
#enable_rf = "auto"

# Whether to enable TensorFlow models (beta version, no mojo) (auto/on/off)
#enable_tensorflow = "off"

# Whether to enable RuleFit support (beta version, no mojo) (auto/on/off)
#enable_rulefit = "off"

# Whether to enable FTRL support (beta version, no mojo) (follow the regularized leader) model (auto/on/off)
#enable_ftrl = "off"

# Maximum number of GBM trees or GLM iterations
# Early-stopping usually chooses less
#max_nestimators = 3000

# factor by which max_nestimators is reduced for tuning and feature evolution
#max_nestimators_feature_evolution_factor = 0.2

# Maximum tree depth (and corresponding max max_leaves as 2**max_max_depth)
#max_max_depth = 12

# Default max_bin for tree methods
#default_max_bin = 256

# Default max_bin for lightgbm (recommended for GPU lightgbm)
#default_lightgbm_max_bin = 64

# Maximum max_bin for any tree
#max_max_bin = 256

# Minimum max_bin for any tree
#min_max_bin = 32

# Amount of memory that can handle 125 columns at max_bin = 256, or 1000 columns at max_bin = 32
# As available memory on system goes higher than this scale, can handle proportionally more columns at higher max_bin
# Currently set to 10GB
#scale_mem_for_max_bin = 10737418240

# Factor by which rf gets more depth than gbdt
#factor_rf = 1.5

# Upper limit on learning rate for GBM models
# If want to override min_learning_rate and min_learning_rate_final, set this to smaller value
#max_learning_rate = 0.5

# Lower limit on learning rate for feature engineering GBM models
#min_learning_rate = 0.05

# Lower limit on learning rate for final ensemble GBM models
#min_learning_rate_final = 0.01

# Max. number of epochs for TensorFlow models
#tensorflow_max_epochs = 10

# Whether tensorflow will use all CPU cores, or if it will split among all transformers
#tensorflow_use_all_cores = true

# Whether tensorflow will use all CPU cores if reproducible is set, or if it will split among all transformers
#tensorflow_use_all_cores_even_if_reproducible_true = false

# Max. number of epochs for TensorFlow models for making NLP features
#tensorflow_max_epochs_nlp = 2

# Accuracy setting equal and above which will add all enabled TensorFlow NLP models below at start of experiment for text dominated problems
#enable_tensorflow_nlp_accuracy_switch = 5

# Whether to use Word-based CNN TensorFlow models for NLP if tensorflow enabled
#enable_tensorflow_textcnn = false

# Whether to use Word-based Bi-GRU TensorFlow models for NLP if tensorflow enabled
#enable_tensorflow_textbigru = false

# Whether to use Character-level CNN TensorFlow models for NLP if tensorflow enabled
#enable_tensorflow_charcnn = false

# Max number of rules to be used for RuleFit models (-1 for all)
#rulefit_max_num_rules = -1

# Max tree depth for RuleFit models
#rulefit_max_tree_depth = 6

# Max number of trees for RuleFit models
#rulefit_max_num_trees = 100

# Internal threshold for number of rows x number of columns to trigger no rulefit models due to being too slow currently
#rulefit_threshold_data_size_large = 100000000

# Enable One-Hot-Encoding (which does binning to limit the number of bins to no more than 100 anyway) for categorical columns with fewer than this many unique values
# Set to 0 to disable
#one_hot_encoding_cardinality_threshold = 50

# list of possible bins for target encoding (first is default value)
#te_bin_list = [25, 10, 100, 250]

# list of possible bins for weight of evidence encoding (first is default value)
# If only want one value: woe_bin_list = [2]
#woe_bin_list = [25, 10, 100, 250]

# list of possible bins for one-hot encoding (first is default value)
#ohe_bin_list = [10, 25, 50, 75, 100]

# Enable time series recipe
#time_series_recipe = true

# earliest datetime for automatic conversion of integers in %Y%m%d format to a time column during parsing
#min_ymd_timestamp = 19700101

# latest datetime for automatic conversion of integers in %Y%m%d format to a time column during parsing
#max_ymd_timestamp = 20300101

# maximum number of data samples (randomly selected rows) for date/datetime format detection
#max_rows_datetime_format_detection = 100000

# Whether to enable train/valid and train/test distribution shift detection
#check_distribution_shift = true

# Normalized training variable importance above which to check the feature for shift
# Useful to avoid checking likely unimportant features
#shift_key_features_varimp = 0.01

# Whether to only check certain features based upon the value of shift_key_features_varimp
#check_reduced_features = true

# Number of trees to use to train model to check shift in distribution
# No larger than max_nestimators
#shift_trees = 100

# The value of max_bin to use for trees to use to train model to check shift in distribution
#shift_max_bin = 256

# The value of max_depth to use for trees to use to train model to check shift in distribution
#shift_max_depth = 4

# If distribution shift detection is enabled, show features for which shift AUC is above this value
# (AUC of a binary classifier that predicts whether given feature value belongs to train or test data)
#detect_features_distribution_shift_threshold_auc = 0.55

# If distribution shift detection is enabled, drop features for which shift AUC is above this value
# (AUC of a binary classifier that predicts whether given feature value belongs to train or test data)
#drop_features_distribution_shift_threshold_auc = 0.6

# Minimum number of features to keep; if 1, at least the least-shifted feature is kept
#drop_features_distribution_shift_min_features = 1

# Whether to enable detailed traces (in GUI Trace)
#detailed_traces = false

# How close to the optimal value (usually 1 or 0) does the validation score need to be to be considered perfect (to stop the experiment)?
#abs_tol_for_perfect_score = 1e-4

#############################################################################
## Time Series settings

# Normalized probability of choosing to lag non-targets relative to targets
#prob_lag_non_targets = 0.1

# Unnormalized probability of choosing other lag time-series transformers based on interactions
#prob_lagsinteraction = 0.1

# Unnormalized probability of choosing other lag time-series transformers based on aggregations
#prob_lagsaggregates = 0.1

# Automatically generate is-holiday features from date columns
#holiday_features = true

# Country code to use to look up holiday calendar (Python package 'holiday')
#holiday_country = "US"

# Max. sample size for automatic determination of time series train/valid split properties, only if time column is selected
#max_time_series_properties_sample_size = 1000000

# Maximum number of lag sizes, which are sampled from if sample_lag_sizes==true, else all are taken (-1 == automatic)
#max_lag_sizes = -1

# Minimum required autocorrelation threshold for a lag to be considered for feature engineering
#min_lag_autocorrelation = 0.1

# How many samples of lag sizes to use for a single time group (single time series signal)
#max_signal_lag_sizes = 100

# Whether to sample lag sizes
#sample_lag_sizes = false

# Probability for new Lags/EWMA gene to use default lags (determined by frequency/gap/horizon, independent of data)
#prob_default_lags = 0.2

# How many samples of lag sizes to use, chosen randomly out of original set of lag sizes
#max_sampled_lag_sizes = 10

# Override lags to be used
# e.g. [7, 14, 21] # this exact list
# e.g. 21 # produce from 1 to 21
# e.g. 21:3 produce from 1 to 21 in step of 3
# e.g. 5-21 produce from 5 to 21
# e.g. 5-21:3 produce from 5 to 21 in step of 3
#override_lag_sizes = []

# Whether to consider time groups columns as potential features
#allow_tgc_memorization = false

# Maximum time t spent to generate training holdout predictions
# t < 0: up to 1440 minutes (24h) spent for generating training holdout predictions
# t = 0: no training holdout predictions
# t > 0: up to t minutes spent for generating training holdout predictions
#time_series_holdout_predictions_timebank = 0

##################################################################################
## MLI (Machine Learning Interpretability) settings
# When the number of rows is above this limit, sample for MLI scoring of UI data
#mli_sample_above_for_scoring = 1000000

# When the number of rows is above this limit, sample for MLI training of surrogate models
#mli_sample_above_for_training = 100000

# When sampling for MLI, how many rows to sample
#mli_sample_size = 100000

# how many bins to use for quantile binning
#mli_num_quantiles = 10

# mli random forest number of trees
#mli_drf_num_trees = 100

# Whether to speed up predictions used inside MLI with a fast approximation
#mli_fast_approx = true

# mli number of trees for fast_approx during predict for Shapley
#fast_approx_num_trees = 50

# whether to do only 1 fold and 1 model of all folds and models if ensemble
#fast_approx_do_one_fold_one_model = true

# mli random forest max depth
#mli_drf_max_depth = 20

# not only sample training, but also sample scoring
#mli_sample_training = true

# regularization strength for k-LIME GLMs
#klime_lambda = [1e-6, 1e-8]
#klime_alpha = 0.0

# mli converts numeric columns to enum when cardinality is <= this value
#mli_max_numeric_enum_cardinality = 25

# Maximum number of features allowed for k-LIME k-means clustering
#mli_max_number_cluster_vars = 6

# Use all columns for k-LIME k-means clustering (this will override `mli_max_number_cluster_vars` if set to `true`)
#use_all_columns_klime_kmeans = false

# Strict version check for MLI
#mli_strict_version_check = true

# MLI cloud name
#mli_cloud_name = ""

##############################################################################
## Machine Learning Output : What kinds of files are written related to the machine learning process

# Whether to dump every scored individual's variable importance (both derived and original) to csv/tabulated/json file
# produces files like: individual_scored_id%d.iter%d*features*
#dump_varimp_every_scored_indiv = false

# Whether to dump every scored individual's model parameters to csv/tabulated file
# produces files like: individual_scored_id%d.iter%d*params*
#dump_modelparams_every_scored_indiv = false

# Whether to append (false) or write separate files (true) for model parameters of every scored individual
#dump_modelparams_separate_files = false

# Location of the AutoDoc template
#autodoc_template = "report_template.docx"

# Whether to compute training, validation, and test correlation matrix (table and heatmap pdf) and save to disk
# alpha: currently single threaded and slow for many columns
#compute_correlation = false

# Whether to dump to disk a correlation heatmap
#produce_correlation_heatmap = false

# Threshold above which to report high correlation between original features
#high_correlation_value_to_report = 0.95

# Whether to dump timing for each transformer to *timings.txt files
#write_trans_timings = true

# Whether to delete preview timings if transformer timings were written
#delete_preview_trans_timings = true

# Whether to delete preview cache on server exit
#preview_cache_upon_server_exit = true


##############################################################################
## Connectors : Configure connector specifications here
#
# Instance Local file system
# This option disables access to the DAI data_directory from the file browser
#file_hide_data_directory = true
# This option specifies a list of absolute path prefixes
# which will be the only paths accessible in the file browser.
# For example:
# file_path_filter_include = "['/data','/home/michal/']"
#file_path_filter_include = "[]"

# Enable usage of path filters
#file_path_filtering_enabled = false
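
# e.g. (illustrative, path follows the example above): restrict the file browser to /data only
# file_path_filtering_enabled = true
# file_path_filter_include = "['/data']"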

## HDFS
## Note that if using Kerberos, be sure that the DAI time
## is synched with the Kerberos server.

# Configurations for a HDFS data source
# Path of HDFS core-site.xml
# core_site_xml_path is deprecated, please use hdfs_config_path
#core_site_xml_path = ""

# HDFS config folder path, can contain multiple config files
#hdfs_config_path = ""

# Path of the principal key tab file
#key_tab_path = ""

# HDFS connector
# Auth type can be noauth/principal/keytab/keytabimpersonation
# Specify HDFS Auth Type, allowed options are:
#   noauth : No authentication needed
#   principal : Authenticate with HDFS with a principal user
#   keytab : Authenticate with a Key tab (recommended). If running
#             DAI as a service, then the Kerberos keytab needs to
#             be owned by the DAI user.
#   keytabimpersonation : Login with impersonation using a keytab
#hdfs_auth_type = "noauth"

# Kerberos app principal user (recommended)
#hdfs_app_principal_user = ""

# Deprecated: Do not use hdfs_app_login_user, user name is taken from user login
#hdfs_app_login_user = ""

# JVM args for HDFS distributions, provide args separated by spaces
# -Djava.security.krb5.conf=<path>/krb5.conf
# -Dsun.security.krb5.debug=true
# -Dlog4j.configuration=file:///<path>log4j.properties
#hdfs_app_jvm_args = ""
# HDFS class path
#hdfs_app_classpath = ""
# Limit files returned from HDFS
#hdfs_max_files_listed = 100

# Blue Data DTap connector settings are similar to HDFS connector settings.
#
# Specify DTap Auth Type, allowed options are:
#   noauth : No authentication needed
#   principal : Authenticate with DTap with a principal user
#   keytab : Authenticate with a Key tab (recommended). If running
#             DAI as a service, then the Kerberos keytab needs to
#             be owned by the DAI user.
#   keytabimpersonation : Login with impersonation using a keytab
#
# NOTE: "hdfs_app_classpath" and "core_site_xml_path" are both required to be set for DTap connector
#dtap_auth_type = "noauth"
# DTap (HDFS) config folder path, can contain multiple config files
#dtap_config_path = ""
# Path of the principal key tab file
#dtap_key_tab_path = ""
# Kerberos app principal user (recommended)
#dtap_app_principal_user = ""
# Specify the user id of the current user here as user@realm
#dtap_app_login_user = ""
# JVM args for DTap distributions, provide args separated by spaces
#dtap_app_jvm_args = ""
# DTap (HDFS) class path. NOTE: set "hdfs_app_classpath" also
#dtap_app_classpath = ""

# S3 Connector credentials
#aws_access_key_id = ""
#aws_secret_access_key = ""
#aws_role_arn = ""

# What region to use when none is specified in the S3 URL.
# Ignored when aws_s3_endpoint_url is set.
#aws_default_region = ""

# Sets the endpoint URL that will be used to access S3.
#aws_s3_endpoint_url = ""

# If set to true, the S3 Connector will try to obtain credentials associated with
# the role attached to the EC2 instance.
#aws_use_ec2_role_credentials = false

# Starting S3 path displayed in UI S3 browser
#s3_init_path = "s3://h2o-public-test-data/smalldata/"

# GCS Connector credentials
# example (suggested) -- "/licenses/my_service_account_json.json"
#gcs_path_to_service_account_json = ""

# Minio Connector credentials
#minio_endpoint_url = ""
#minio_access_key_id = ""
#minio_secret_access_key = ""

# Snowflake Connector credentials
# Recommended: provide url, user, password
# Alternatively: provide account, user, password
# Example URL: https://<snowflake_account>.<region>.snowflakecomputing.com
#snowflake_url = ""
#snowflake_user = ""
#snowflake_password = ""
#snowflake_account = ""

# KDB Connector credentials
#kdb_user = ""
#kdb_password = ""
#kdb_hostname = ""
#kdb_port = ""
#kdb_app_classpath = ""
#kdb_app_jvm_args = ""

# Azure Blob Store Connector credentials
#azure_blob_account_name = ""
#azure_blob_account_key = ""
#azure_connection_string = ""

# Notification scripts
# - the variable points to the location of a script which is executed at a given event in the experiment lifecycle
# - the script should have the executable flag enabled
# - use of an absolute path is suggested
# The on experiment start notification script location
#listeners_experiment_start = ""
# The on experiment finished notification script location
#listeners_experiment_done = ""

# Default AWS credentials to be used for scorer deployments.
#deployment_aws_access_key_id = ""
#deployment_aws_secret_access_key = ""
#deployment_aws_bucket_name = ""

# Allow the browser to store e.g. login credentials in login form (set to false for higher security)
#allow_form_autocomplete = true

# Enable Projects workspace (alpha version, for evaluation)
#enable_projects = false

##############################################################################
## END
