[encoder]
n_layer = 6
n_head = 8
hidden_size = 512
ffn_hidden_size = 2048
vocab_size = 32128
n_positions = 1024
has_position_embedding = False
has_token_type_embedding = False
has_embedding_layernorm = False
has_embedding_scale = False
q_scaling = 0.125
has_attention_qkvo_bias = False
has_mlp_bias = False
has_model_final_layernorm = True
layernorm_eps = 1e-6
layernorm_position = pre_layernorm
layernorm_type = RmsNorm
hidden_act = relu
relative_attention = True
num_buckets = 32
max_distance = 128
storage_dtype = float32

[decoder]
n_layer = 6
n_head = 8
hidden_size = 512
ffn_hidden_size = 2048
vocab_size = 32128
n_positions = 1024
has_position_embedding = False
has_token_type_embedding = False
has_embedding_layernorm = False
has_embedding_scale = False
q_scaling = 0.125
has_attention_qkvo_bias = False
has_mlp_bias = False
has_model_final_layernorm = True
layernorm_eps = 1e-6
layernorm_position = pre_layernorm
layernorm_type = RmsNorm
hidden_act = relu
has_lm_head_bias = False
relative_attention = True
num_buckets = 32
max_distance = 128
storage_dtype = float32