[encoder]
n_layer = 6
n_head = 8
hidden_size = 512
ffn_hidden_size = 2048
vocab_size = 32128
n_positions = 1024
has_position_embedding = False
has_token_type_embedding = False
has_embedding_layernorm = False
has_embedding_scale = False
q_scaling = 0.125
has_attention_qkvo_bias = False
has_mlp_bias = False
has_model_final_layernorm = True
layernorm_eps = 1e-6
layernorm_position = pre_layernorm
layernorm_type = RmsNorm
hidden_act = relu
relative_attention = True
num_buckets = 32
max_distance = 128
storage_dtype = float32

[decoder]
n_layer = 6
n_head = 8
hidden_size = 512
ffn_hidden_size = 2048
vocab_size = 32128
n_positions = 1024
has_position_embedding = False
has_token_type_embedding = False
has_embedding_layernorm = False
has_embedding_scale = False
q_scaling = 0.125
has_attention_qkvo_bias = False
has_mlp_bias = False
has_model_final_layernorm = True
layernorm_eps = 1e-6
layernorm_position = pre_layernorm
layernorm_type = RmsNorm
hidden_act = relu
has_lm_head_bias = False
relative_attention = True
num_buckets = 32
max_distance = 128
storage_dtype = float32