CNN for Sentence Classification

Paper: Convolutional Neural Networks for Sentence Classification

This implements the three variants from the original paper (they differ only in how the embedding layer is handled; a short sketch contrasting the three setups follows this list):

  1. CNN-non-static

    Initializes the embedding layer with pre-trained word vectors, which are then fine-tuned during training

  2. CNN-static

    Uses pre-trained word vectors to convert the input directly into fixed dense word vectors

  3. CNN-rand

    Randomly initializes the embedding layer
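
At the layer level, the three setups differ only in how the Keras Embedding layer is configured (or bypassed). The sketch below is illustrative only and not taken from this repository; vocab_size, embedding_dim, sequence_length, and the pre-trained matrix weights are assumed placeholders. Note that the CNN-static code later in this post instead feeds dense vectors into the model directly, which is equivalent to freezing the embedding.

# Illustrative sketch: the three embedding setups side by side (not repository code).
# Assumes weights is a (vocab_size, embedding_dim) numpy array of pre-trained vectors.
from keras.layers import Embedding
from keras import initializers

# CNN-rand: default random initialization, learned from scratch
emb_rand = Embedding(vocab_size, embedding_dim, input_length=sequence_length)

# CNN-non-static: initialized from pre-trained vectors, fine-tuned during training
emb_non_static = Embedding(vocab_size, embedding_dim, input_length=sequence_length,
                           embeddings_initializer=initializers.Constant(weights))

# CNN-static: same initialization, but the weights are kept frozen
emb_static = Embedding(vocab_size, embedding_dim, input_length=sequence_length,
                       embeddings_initializer=initializers.Constant(weights),
                       trainable=False)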

CNN-non-static

Model architecture

(Figure: CNN-non-static model architecture)

Core code

#coding:utf8
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input,MaxPooling1D,GlobalMaxPooling1D,Convolution1D,Embedding
from keras import initializers
from keras.layers.merge import Concatenate
from keras.datasets import imdb #dataset
from keras.preprocessing import sequence #padding
from data_helper import load_data
from w2v import train_word2vec
import numpy as np
np.random.seed(0)

#CNN-non-static
embedding_dim = 50
filter_sizes = (3,4,5) # hyperparameter setting from the paper
#filter_sizes = (3,8)
num_filters = 10
dropout_prob = (0.5,0.8)
hidden_dims = 50
#Training parameters
batch_size = 64
num_epochs = 10
#Preprocessing parameters
sequence_length = 400
max_words = 5000
#w2v parameters(see train_word2vec)
min_word_count = 1
context = 10

#DataPreparation
print("Load data...")
x_train,y_train,x_test,y_test,vocabulary_inv = load_data()

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

#prepare embedding layer weights
embedding_weights = train_word2vec(np.vstack((x_train,x_test)),vocabulary_inv,num_features=embedding_dim,min_word_count=min_word_count,context=context)

#input layer
input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

#embedding layer, initialized with the trained word2vec vectors
weights = np.array([v for v in embedding_weights.values()])
initializer = initializers.Constant(weights)
z = Embedding(len(vocabulary_inv), embedding_dim,embeddings_initializer=initializer,input_length=sequence_length, name='embedding')(model_input)
z = Dropout(dropout_prob[0])(z)

#Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters, kernel_size=sz,
                         padding="valid", activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    #conv = GlobalMaxPooling1D()(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks)>1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)

#dense layer
z = Dense(hidden_dims, activation="relu")(z)
#output layer
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

#train the model
model.fit(x_train,y_train,batch_size=batch_size,epochs=num_epochs,validation_data=(x_test,y_test),verbose=2)
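
After training, the fitted model can score new index-encoded, padded sequences. A minimal usage sketch (not part of the original script), reusing the test data already loaded above:

# Illustrative only: predict sentiment probabilities for a few test samples
probs = model.predict(x_test[:5])           # shape (5, 1), sigmoid outputs in [0, 1]
preds = (probs > 0.5).astype(int).ravel()   # threshold at 0.5 to get class labels
print(preds, y_test[:5])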

Experimental results

(Figure: CNN-non-static training results)

CNN-static

Model architecture

(Figure: CNN-static model architecture)

Core code

#coding:utf8
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input,MaxPooling1D,GlobalMaxPooling1D,Convolution1D,Embedding
from keras import initializers
from keras.layers.merge import Concatenate
from keras.datasets import imdb #dataset
from keras.preprocessing import sequence #padding
from data_helper import load_data
from w2v import train_word2vec
import numpy as np
np.random.seed(0)

#CNN-static
embedding_dim = 50
filter_sizes = (3,4,5)
num_filters = 10
dropout_prob = (0.5,0.8)
hidden_dims = 50
#Training parameters
batch_size = 64
num_epochs = 10
#Preprocessing parameters
sequence_length = 400
max_words = 5000
#w2v parameters (see train_word2vec)
min_word_count = 1
context = 10

x_train,y_train,x_test,y_test,vocabulary_inv = load_data()

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

#prepare embedding layer weights
embedding_weights = train_word2vec(np.vstack((x_train,x_test)), vocabulary_inv, num_features=embedding_dim,min_word_count=min_word_count,context=context)

x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train])
x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
print("x_train static shape:", x_train.shape)
print("x_test static shape:",x_test.shape)

#input layer
input_shape = (sequence_length,embedding_dim)
model_input = Input(shape=input_shape)
z = model_input
z = Dropout(dropout_prob[0])(z)


#Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters, kernel_size=sz,
                         padding="valid", activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    #conv = GlobalMaxPooling1D()(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks)>1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)

#dense layer
z = Dense(hidden_dims, activation="relu")(z)
#output layer
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

#train the model
model.fit(x_train,y_train,batch_size=batch_size,epochs=num_epochs,validation_data=(x_test,y_test),verbose=2)
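
Materializing x_train and x_test as dense (samples, sequence_length, embedding_dim) arrays can be memory-hungry for large corpora. An alternative way to get the same static behaviour, sketched here only as a hedged variation (it is not what the script above does), is to keep the integer inputs and freeze an Embedding layer, as in the comparison sketch near the top of this post:

# Alternative sketch: static embeddings via a frozen Embedding layer (not the original script).
# Assumes weights is the (vocab_size, embedding_dim) matrix built from embedding_weights.
model_input = Input(shape=(sequence_length,))
z = Embedding(len(vocabulary_inv), embedding_dim,
              embeddings_initializer=initializers.Constant(weights),
              input_length=sequence_length,
              trainable=False)(model_input)  # embedding weights stay fixed during training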

Experimental results

(Figure: CNN-static training results)

CNN-rand

Model architecture

(Figure: CNN-rand model architecture)

Core code

#coding:utf8
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input,MaxPooling1D,GlobalMaxPooling1D,Convolution1D,Embedding
from keras import initializers
from keras.layers.merge import Concatenate
from keras.datasets import imdb #dataset
from keras.preprocessing import sequence #padding
from data_helper import load_data
from w2v import train_word2vec
import numpy as np
np.random.seed(0)

embedding_dim = 50
filter_sizes = (3,4,5) # hyperparameter setting from the paper
num_filters = 10
dropout_prob = (0.5,0.8)
hidden_dims = 50
#Training parameters
batch_size = 64
num_epochs = 10
#Preprocessing parameters
sequence_length = 400
max_words = 5000
#w2v parameters(see train_word2vec)
min_word_count = 1
context = 10

#DataPreparation
print("Load data...")
x_train,y_train,x_test,y_test,vocabulary_inv = load_data()

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))


#input layer
input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

#embedding layer
z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length,name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)

#Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters, kernel_size=sz,
                         padding="valid", activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    #conv = GlobalMaxPooling1D()(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks)>1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)

#dense layer
z = Dense(hidden_dims, activation="relu")(z)
#output layer
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

#train the model
model.fit(x_train,y_train,batch_size=batch_size,epochs=num_epochs,validation_data=(x_test,y_test),verbose=2)
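
Since CNN-rand learns its embeddings from scratch, it can be instructive to pull the trained matrix out of the model afterwards and compare it with the word2vec vectors used by the other two variants. A minimal sketch (not in the original script), relying on the layer name "embedding" given above:

# Illustrative only: extract the learned embedding matrix after training
learned = model.get_layer("embedding").get_weights()[0]  # (vocab_size, embedding_dim)
print("Learned embedding matrix shape:", learned.shape)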

Experimental results

(Figure: CNN-rand training results)