diff --git a/python/dllib/src/bigdl/dllib/keras/layers/self_attention.py b/python/dllib/src/bigdl/dllib/keras/layers/self_attention.py
index 8e5609b7093..f0114d4bdaf 100644
--- a/python/dllib/src/bigdl/dllib/keras/layers/self_attention.py
+++ b/python/dllib/src/bigdl/dllib/keras/layers/self_attention.py
@@ -47,6 +47,14 @@ class TransformerLayer(ZooKerasLayer):
     """
     A self attention layer
 
+    Input is a list which consists of 2 ndarrays.
+    1. Token id ndarray: shape [batch, seqLen] with the word token indices in the vocabulary
+    2. Position id ndarray: shape [batch, seqLen] with positions in the sentence.
+    Output is a list which contains:
+    1. The states of Transformer layer.
+    2. The pooled output which processes the hidden state of the last layer with regard to the first
+    token of the sequence. This would be useful for segment-level tasks.
+
     # Arguments
     nBlock: block number
     hidden_drop: drop probability off projection
@@ -202,7 +210,7 @@ def init(cls, vocab=40990, seq_len=77, n_block=12, hidden_drop=0.1,
         output_all_block: whether output all blocks' output
         """
         if hidden_size < 0:
-            raise TypeError('hidden_size must be greater than 0 with default embeddding layer')
+            raise TypeError('hidden_size must be greater than 0 with default embedding layer')
         from bigdl.nn.layer import Squeeze
         word_input = InputLayer(input_shape=(seq_len,))
         postion_input = InputLayer(input_shape=(seq_len,))
@@ -226,21 +234,24 @@ class BERT(TransformerLayer):
     """
     A self attention layer.
-    Input is a List which consists of 4 ndarrays.
+    Input is a list which consists of 4 ndarrays.
     1. Token id ndarray: shape [batch, seqLen] with the word token indices in the vocabulary
     2. Token type id ndarray: shape [batch, seqLen] with the token types in [0, 1].
-    0 menas `sentence A` and 1 means a `sentence B` (see BERT paper for more details).
+    0 means `sentence A` and 1 means a `sentence B` (see BERT paper for more details).
     3. Position id ndarray: shape [batch, seqLen] with positions in the sentence.
     4. Attention_mask ndarray: shape [batch, seqLen] with indices in [0, 1]. It's a mask to be
     used if the input sequence length is smaller than seqLen in the current batch.
-    Output is a list which output the states of BERT layer
+    Output is a list which contains:
+    1. The states of BERT layer.
+    2. The pooled output which processes the hidden state of the last layer with regard to the first
+    token of the sequence. This would be useful for segment-level tasks.
 
     # Arguments
     n_block: block number
     n_head: head number
     intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-    hidden_drop: The dropout probabilitiy for all fully connected layers
+    hidden_drop: The dropout probability for all fully connected layers
     attn_drop: drop probability of attention
     initializer_ranger: weight initialization range
     output_all_block: whether output all blocks' output
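
For context, the sketch below illustrates the input/output contract documented in the updated docstrings: every input ndarray has shape [batch, seqLen], TransformerLayer consumes the token id and position id arrays, and BERT additionally takes token type ids and an attention mask. The ndarray construction is plain NumPy based on the docstrings; the commented-out `BERT.init`/`forward` calls at the end are assumptions about typical usage, not part of this patch.

```python
import numpy as np

# Hypothetical sketch of the documented input contract; shapes ([batch, seqLen])
# and the meaning of each ndarray come from the docstrings above, while the
# variable names and the batch size are illustrative assumptions.
batch, seq_len, vocab = 2, 77, 40990

token_ids = np.random.randint(0, vocab, size=(batch, seq_len))  # word indices in the vocabulary
token_type_ids = np.zeros((batch, seq_len))                     # 0 = `sentence A`, 1 = `sentence B`
position_ids = np.tile(np.arange(seq_len), (batch, 1))          # position of each token in the sentence
attention_mask = np.ones((batch, seq_len))                      # 0 marks padding when real length < seq_len

# TransformerLayer takes [token_ids, position_ids]; BERT takes all four ndarrays.
# The construction/forward calls below are assumptions, not confirmed by this diff:
# from bigdl.dllib.keras.layers.self_attention import BERT
# bert = BERT.init(vocab=vocab, seq_len=seq_len, output_all_block=False)
# states, pooled_output = bert.forward([token_ids, token_type_ids, position_ids, attention_mask])
```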