How to Use Chainer and Its Application to Natural Language Processing

How to use Chainer and its application to natural language processing. 2015/11/25, WebDB Forum @ Shibaura Institute of Technology. Yuya Unno, Preferred Infrastructure, Inc. For Chainer v1.5.

Transcript of: How to Use Chainer and Its Application to Natural Language Processing

  • How to use Chainer and its application to natural language processing

    2015/11/25 WebDB Forum @ Shibaura Institute of Technology

    Yuya Unno, Preferred Infrastructure, Inc.

    For Chainer v1.5

  • About the speaker: Yuya Unno

    Career: -2008, 2008-2011, 2011- Preferred Infrastructure

    Development of Jubatus and Chainer

    NLP, 2014-2015

  • Chainer

    http://chainer.org/

    This talk targets Chainer v1.5.

  • Chainer is a framework for Deep Learning

    Supports CUDA, NVIDIA's GPU computing platform

    Recurrent Nets are straightforward to write in Chainer


  • A neural network's computation forms a DAG (directed acyclic graph) over variables and operations

  • Example: z = x ** 2 + 2 * x * y + y

    [Computational graph: x and y flow into the operation nodes _ ** 2, 2 * _, _ * _ and _ + _, whose results combine to produce z]


  • Machine learning as optimization

    argmin_w Σ_{(x, y)} l(x, y; w)

    The training data is a set of input/output pairs (x, y)

    l(x, y; w) is the loss on one example (x, y) under the parameters w

  • Learning by gradient descent

    Differentiate the total loss with respect to the parameters w, and repeatedly step w against the gradient:

    initialize w
    until converge:
        w := w - η * d/dw L(x, y; w)    # η: learning rate
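    To make the update rule concrete, here is a minimal sketch of the same loop in plain NumPy; the toy data, the learning rate, and the fixed iteration count are illustrative choices, not from the talk.

    import numpy as np

    # Toy data roughly following y = 3x; we fit y ≈ w * x by least squares.
    x = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
    y = np.array([3.1, 5.9, 9.2, 11.8], dtype=np.float32)

    w = 0.0      # initialize w
    eta = 0.01   # learning rate
    for _ in range(100):                    # "until converge", simplified
        grad = ((w * x - y) * x).mean()     # d/dw of (mean squared error) / 2
        w -= eta * grad
    print(w)                                # close to 3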

  • Differentiation by the chain rule

    z = h(y),  y = g(x),  x = f(w)

    ∂z/∂w = (∂z/∂y)(∂y/∂x)(∂x/∂w) = Dh(y) Dg(x) Df(w)
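    The chain rule is easy to check numerically; the particular f, g, h below are arbitrary illustrations, not from the talk.

    import numpy as np

    f = lambda w: w ** 2        # x = f(w)
    g = lambda x: np.sin(x)     # y = g(x)
    h = lambda y: 3 * y         # z = h(y)
    Df = lambda w: 2 * w
    Dg = lambda x: np.cos(x)
    Dh = lambda y: 3.0

    w = 0.7
    analytic = Dh(g(f(w))) * Dg(f(w)) * Df(w)   # Dh(y) Dg(x) Df(w)
    eps = 1e-6
    numeric = (h(g(f(w + eps))) - h(g(f(w - eps)))) / (2 * eps)
    print(analytic, numeric)    # the two values agree closely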


  • Backpropagation

    1. Run the forward computation, recording the graph

    2. Propagate gradients backward through the graph via the chain rule

    3. Update each parameter using its gradient

  • Backpropagation in a Recurrent Net

    Gradients are propagated backward through time, from t=T to t=T-1 and so on

    [Figure: the recurrence unrolled around steps T-1 and T]

  • A Recurrent Net unrolled in time is a DAG

    Applying backprop to the unrolled DAG is called Backprop Through Time (BPTT)

    [Figure: the network unrolled over t=1, t=2, t=3, t=4]

  • Truncated BPTT

    For long sequences, backpropagating through the entire history is too costly, so the graph is cut after a fixed number of steps

    Backprop over the truncated graph is called Truncated BPTT (a concrete Chainer implementation appears in the LSTM-RNN training loop later in this talk)

    [Figure: the network unrolled over t=1 to t=4, with the history truncated partway]

  • How to use Chainer

  • Chainer

    Chainer is a Python framework: networks are defined and executed as ordinary Python code

  • Installing Chainer

    Runs on Linux (Ubuntu is the main target)

    Python: CPython 2.7+, 3.4+, 3.5+

    Install with pip: pip install chainer, then check that import chainer succeeds

    Anaconda is a convenient way to get a full Python distribution

    pyenv manages Python versions, and can also install Anaconda
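    A quick post-install check (assuming the pip install above succeeded):

    import chainer
    print(chainer.__version__)   # e.g. '1.5.0' for the version this talk targets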

  • Computation in Chainer is built on Variable

    Passing a Variable through a Function returns a new Variable, and the applied Functions are recorded, which is what builds the computational graph

    Functions live in chainer.functions

  • The earlier example, written in Chainer

    [Computational graph: x and y flow through _ ** 2, 2 * _, _ * _ and _ + _ to produce z]

    x = Variable(...)
    y = Variable(...)
    z = x ** 2 + 2 * x * y + y

  • Variable

    Wraps an n-dimensional array: NumPy on the CPU or CuPy on the GPU

    The raw array is accessible through the data attribute

    A Variable also remembers the Function that created it; here x is a 10 × 20 array (e.g. a minibatch of ten 20-dimensional vectors)

    Chainer computes in float32 by default

    x = Variable(np.zeros((10, 20), dtype=np.float32))
    x.data   # the underlying NumPy array
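    As a small sketch of the full round trip (assuming NumPy imported as np): square a Variable, seed the output gradient, and read the result back from grad.

    import numpy as np
    from chainer import Variable

    x = Variable(np.array([[2.0, 3.0]], dtype=np.float32))
    y = x ** 2
    y.grad = np.ones_like(y.data)   # seed the gradient of the output
    y.backward()
    print(x.grad)                   # [[4. 6.]], since dy/dx = 2x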

  • Function

    Parameter-free operations live in chainer.functions (conventionally imported as F)

    F.relu, F.max_pooling_2d, F.lstm, ...

    Applying a Function to a Variable returns a Variable

    Since v1.5, parameterized functions are Links instead

    x = Variable(...)
    y = F.relu(x)   # y is also a Variable

  • Link

    A function with parameters; the parameters are what training optimizes, and since v1.5 they can be saved and loaded

    Links live in chainer.links (conventionally imported as L)

    L.Linear, L.Convolution2D, L.EmbedID, ...

    Applying a Link to a Variable returns a Variable

    In v1.5, the parameterized Functions became Links (v1.5 and later)
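    A minimal sketch of calling a Link directly; the layer sizes here are illustrative.

    import numpy as np
    from chainer import Variable
    import chainer.links as L

    layer = L.Linear(20, 5)   # fully connected layer mapping 20 -> 5 units
    x = Variable(np.zeros((10, 20), dtype=np.float32))   # minibatch of 10
    y = layer(x)
    print(y.data.shape)       # (10, 5)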

  • Chain combines Links

    Multiple Links are bundled into a Chain, which acts as the model object

    model = Chain(embed=L.EmbedID(10000, 100),
                  layer1=L.Linear(100, 100),
                  layer2=L.Linear(100, 10000))

    x = Variable(...)
    h = F.relu(model.layer1(model.embed(x)))
    y = model.layer2(h)

    (v1.5 and later)

  • Backward computation

    Every Function knows its own derivative, so calling Variable.backward() computes gradients through the whole graph

    loss = F.softmax_cross_entropy(y, t)
    loss.backward()

  • Optimizer

    Parameter updates are handled by an Optimizer

    Implementations live in chainer.optimizers: SGD, MomentumSGD, AdaGrad, RMSprop, RMSpropGraves, AdaDelta, Adam

    Attach it to a model with setup(); hooks add behaviors such as weight decay

    optimizer = optimizers.SGD()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))  # the rate here is illustrative

  • Using the Optimizer

    Clear the gradients with zerograds(), run the backward pass, then call update()

    model.zerograds()
    loss = ...
    loss.backward()
    optimizer.update()

  • The typical Chainer workflow

    1. Define the model as Links combined in a Chain

    2. Set up an Optimizer with the Chain

    3. Write the forward computation

    4. Prepare the training data

    5. For each minibatch:

       a. load a minibatch

       b. run forward and backward

       c. update with the Optimizer

    6. Repeat from 5a until convergence
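    A compact sketch of this workflow end to end; the layer sizes, toy data, and epoch count are placeholder choices for illustration.

    import numpy as np
    import chainer.functions as F
    import chainer.links as L
    from chainer import Chain, Variable, optimizers

    # 1. Model: Links combined in a Chain
    model = Chain(l1=L.Linear(4, 10), l2=L.Linear(10, 3))

    # 2. Optimizer set up with the Chain
    optimizer = optimizers.SGD()
    optimizer.setup(model)

    # 3. Forward computation
    def forward(x):
        return model.l2(F.relu(model.l1(x)))

    # 4. Toy data: 100 four-dimensional inputs, labels in {0, 1, 2}
    x_data = np.random.randn(100, 4).astype(np.float32)
    t_data = np.random.randint(0, 3, 100).astype(np.int32)

    # 5. Training loop over minibatches
    for epoch in range(10):
        for i in range(0, 100, 20):
            x = Variable(x_data[i:i + 20])
            t = Variable(t_data[i:i + 20])
            model.zerograds()                             # a. clear gradients
            loss = F.softmax_cross_entropy(forward(x), t)
            loss.backward()                               # b. backward
            optimizer.update()                            # c. update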

  • Defining your own Function

    A Function is written in Python by implementing forward(_cpu/_gpu) and backward(_cpu/_gpu)

    To give it parameters, wrap the Function in a Link

  • Example Function

    class SquaredDiff(Function):
        def forward_cpu(self, inputs):
            x, y = inputs            # inputs arrive as a tuple
            z = x - y
            return z * z,            # outputs are returned as a tuple

        def backward_cpu(self, inputs, grad_outputs):
            x, y = inputs
            gz, = grad_outputs       # one gradient per output
            gx = 2 * (x - y) * gz
            return gx, -gx           # one gradient per input
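    A sketch of applying this Function (Function instances are called on Variables; the input values are illustrative):

    import numpy as np
    from chainer import Variable

    x = Variable(np.array([1.0, 2.0], dtype=np.float32))
    y = Variable(np.array([0.5, 3.0], dtype=np.float32))
    z = SquaredDiff()(x, y)          # z.data == (x - y) ** 2 elementwise
    z.grad = np.ones_like(z.data)
    z.backward()
    print(x.grad, y.grad)            # 2(x - y) and -2(x - y)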

  • Testing a Function

    Verify custom Functions with a gradient check

    Compare the analytic gradient from backward against a numerical gradient of forward

    chainer.gradient_check.numerical_grad helps with this

    See tests/chainer_tests/function_tests in the repository for examples
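    A sketch of a gradient check for the SquaredDiff Function above; the test values and tolerances are illustrative.

    import numpy as np
    from chainer import Variable, gradient_check

    x = np.random.randn(3).astype(np.float32)
    y = np.random.randn(3).astype(np.float32)
    gz = np.random.randn(3).astype(np.float32)

    # Analytic gradients via backward()
    vx, vy = Variable(x.copy()), Variable(y.copy())
    z = SquaredDiff()(vx, vy)
    z.grad = gz
    z.backward()

    # Numerical gradients of forward(); numerical_grad perturbs the inputs in place
    gx, gy = gradient_check.numerical_grad(
        lambda: SquaredDiff().forward((x, y)), (x, y), (gz,))

    np.testing.assert_allclose(vx.grad, gx, rtol=1e-2, atol=1e-2)
    np.testing.assert_allclose(vy.grad, gy, rtol=1e-2, atol=1e-2)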

  • CuPy: a CUDA array library

    Implements a NumPy-compatible subset of the API on the GPU

    Supports array manipulation such as reshape

    Also provides elementwise and reduction kernels

  • Setting up CuPy

    Requires an NVIDIA GPU and CUDA 6.5 or later

    On Ubuntu, install CUDA from the official deb packages

    Point PATH and LD_LIBRARY_PATH at the CUDA installation (usually /usr/local/cuda):

    PATH=/usr/local/cuda/bin:$PATH
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

    Then check that Chainer can import cupy

  • Writing CPU/GPU-agnostic code with CuPy

    Replacing numpy with cupy moves NumPy code onto the GPU

    chainer.cuda.get_array_module() returns either numpy or cupy, depending on whether its argument is a cupy.ndarray

    Example: a logsumexp that works with both NumPy and CuPy

    def logsumexp(x, axis=None):
        xp = cuda.get_array_module(x)
        x_max = x.max(axis=axis)
        return x_max + xp.log(xp.exp(x - x_max).sum(axis=axis))
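    The same function then runs unchanged on either backend; the input values below are illustrative (and assume the definition above, with chainer.cuda imported as cuda).

    import numpy as np
    from chainer import cuda

    print(logsumexp(np.array([1.0, 2.0, 3.0], dtype=np.float32)))
    # ≈ 3.4076, computed stably even when exp() would overflow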

  • Applications to natural language processing

  • Models covered

    word2vec: Skip-gram / Continuous BoW, trained with Negative Sampling / Hierarchical Softmax

    Recurrent Neural Networks: Long short-term memory (LSTM), Gated recurrent unit (GRU)

    Recursive Neural Networks: Neural Tensor Network

  • Recurrent Neural Network language model

    A hidden state is carried along the sequence and used to predict the next word at each step

    [Figure: inputs x1 x2 x3 x4, hidden states h1 h2 h3 h4 starting from h0, outputs y1 y2 y3 y4; each output yi predicts the next input xi+1]

  • LSTM-RNN language model

    class RNNLM(chainer.Chain):
        def __init__(self, n_vocab, n_units, train=True):
            super(RNNLM, self).__init__(
                embed=L.EmbedID(n_vocab, n_units),
                l1=L.LSTM(n_units, n_units),
                l2=L.LSTM(n_units, n_units),
                l3=L.Linear(n_units, n_vocab))
            self.train = train

    A 2-layer LSTM over word embeddings

  • LSTM-RNN language model (cont.)

    def reset_state(self):
        self.l1.reset_state()
        self.l2.reset_state()

    def __call__(self, x):
        h0 = self.embed(x)
        h1 = self.l1(F.dropout(h0, train=self.train))
        h2 = self.l2(F.dropout(h1, train=self.train))
        y = self.l3(F.dropout(h2, train=self.train))
        return y

    The LSTM Links hold their own hidden state, so it must be reset explicitly between sequences

  • Training the LSTM-RNN

    for i in range(jump * n_epoch):
        x = chainer.Variable(...)
        t = chainer.Variable(...)
        loss_i = model(x, t)
        accum_loss += loss_i
        ...
        model.zerograds()
        accum_loss.backward()
        accum_loss.unchain_backward()  # truncate the graph
        accum_loss = 0
        optimizer.update()

    unchain_backward() cuts the recorded history, which implements truncated BPTT

  • word2vec

    CBoW predicts a word from its surrounding context; Skip-gram predicts the context from the word

  • word2vec in Chainer

    class ContinuousBoW(chainer.Chain):
        def __init__(self, n_vocab, n_units, loss_func):
            super(ContinuousBoW, self).__init__(
                embed=L.EmbedID(n_vocab, n_units),
                loss_func=loss_func)

        def __call__(self, x, context):
            h = None
            for c in context:
                e = self.embed(c)
                h = h + e if h is not None else e
            return self.loss_func(h, x)

  • Recursive Neural Network

    Composes word vectors along a tree structure such as a parse tree

    A Recurrent Net can be seen as a Recursive Net over a chain-shaped tree

    [Figure: leaves x1, x2, x3 combined as p1 = f(x1, x2), p2 = f(p1, x3)]

  • Recursive Neural Network in Chainer

    class RecursiveNet(chainer.Chain):
        def __init__(self, n_vocab, n_units, n_label):
            super(RecursiveNet, self).__init__(
                embed=L.EmbedID(n_vocab, n_units),
                l=L.Linear(n_units * 2, n_units),
                w=L.Linear(n_units, n_label))

        def leaf(self, x):
            return self.embed(x)

        def node(self, left, right):
            return F.tanh(self.l(F.concat((left, right))))

    leaf embeds a word; node merges two child vectors into one

  • Traversing the tree in the Recursive Net

    def traverse(model, node):
        if isinstance(node['node'], int):  # leaf node
            word = xp.array([node['node']], np.int32)
            loss = 0
            x = chainer.Variable(word)
            v = model.leaf(x)
        else:  # internal node
            left_node, right_node = node['node']
            left_loss, left = traverse(model, left_node)
            right_loss, right = traverse(model, right_node)
            v = model.node(left, right)
            loss = left_loss + right_loss
        return loss, v   # each call returns (accumulated loss, node vector)

  • Examples bundled with Chainer

    The examples directory includes:

    mnist: digit classification on MNIST

    imagenet: ConvNets on ImageNet

    modelzoo: loading Caffe reference models

    ptb: an LSTM language model on the Penn Treebank, using truncated BPTT

    word2vec: word2vec trained on PTB

    sentiment: a Recursive Net for sentiment analysis

  • Summary

    Neural networks are computational graphs trained by backpropagation

    Chainer lets you define and train such graphs directly in Python

    Chainer applies naturally to NLP models such as RNN language models, word2vec, and Recursive Nets