.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   %matplotlib inline
   from mxnet import np, npx
   from d2l import mxnet as d2l

   npx.set_np()

   def init_adadelta_states(feature_dim):
       """Create zero-filled Adadelta state pairs for the weight and the bias.

       Returns ``((s_w, delta_w), (s_b, delta_b))``. The weight states are
       built with shape ``(feature_dim, 1)`` and the bias states with
       ``np.zeros(1)``.
       """
       weight_shape = (feature_dim, 1)
       s_w, delta_w = np.zeros(weight_shape), np.zeros(weight_shape)
       s_b, delta_b = np.zeros(1), np.zeros(1)
       return ((s_w, delta_w), (s_b, delta_b))

   def adadelta(params, states, hyperparams):
       """Run one Adadelta step on every parameter, updating it in place.

       ``states`` supplies one ``(s, delta)`` pair per parameter and
       ``hyperparams['rho']`` is the decay rate used for both running
       averages. ``eps`` is fixed at 1e-5. Nothing is returned.
       """
       rho = hyperparams['rho']
       eps = 1e-5
       for param, (sq_avg, delta_avg) in zip(params, states):
           grad = param.grad
           # Assign through [:] so the existing arrays are modified in place
           # rather than rebinding the local names.
           sq_avg[:] = rho * sq_avg + (1 - rho) * np.square(grad)
           # Rescale the gradient by the ratio of the two running averages.
           step = (np.sqrt(delta_avg + eps) / np.sqrt(sq_avg + eps)) * grad
           param[:] -= step
           delta_avg[:] = rho * delta_avg + (1 - rho) * step * step

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   %matplotlib inline
   import torch
   from d2l import torch as d2l

   def init_adadelta_states(feature_dim):
       """Create zero-filled Adadelta state pairs for the weight and the bias.

       Returns ``((s_w, delta_w), (s_b, delta_b))``. The weight states are
       built with shape ``(feature_dim, 1)`` and the bias states with
       ``torch.zeros(1)``.
       """
       weight_shape = (feature_dim, 1)
       s_w, delta_w = torch.zeros(weight_shape), torch.zeros(weight_shape)
       s_b, delta_b = torch.zeros(1), torch.zeros(1)
       return ((s_w, delta_w), (s_b, delta_b))

   def adadelta(params, states, hyperparams):
       """Run one Adadelta step on every parameter, updating it in place.

       ``states`` supplies one ``(s, delta)`` pair per parameter and
       ``hyperparams['rho']`` is the decay rate used for both running
       averages. ``eps`` is fixed at 1e-5. Each parameter's gradient is
       zeroed after its update. Nothing is returned.
       """
       rho = hyperparams['rho']
       eps = 1e-5
       for param, (sq_avg, delta_avg) in zip(params, states):
           with torch.no_grad():
               grad = param.grad
               # Assign through [:] so the existing tensors are modified in
               # place rather than rebinding the local names.
               sq_avg[:] = rho * sq_avg + (1 - rho) * torch.square(grad)
               # Rescale the gradient by the ratio of the running averages.
               step = (torch.sqrt(delta_avg + eps)
                       / torch.sqrt(sq_avg + eps)) * grad
               param[:] -= step
               delta_avg[:] = rho * delta_avg + (1 - rho) * step * step
           param.grad.data.zero_()

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   %matplotlib inline
   import tensorflow as tf
   from d2l import tensorflow as d2l

   def init_adadelta_states(feature_dim):
       """Create zero-filled Adadelta state variables for weight and bias.

       Returns ``((s_w, delta_w), (s_b, delta_b))`` where every entry is a
       ``tf.Variable``. The weight states are built with shape
       ``(feature_dim, 1)`` and the bias states with ``tf.zeros(1)``.
       """
       weight_shape = (feature_dim, 1)
       s_w = tf.Variable(tf.zeros(weight_shape))
       s_b = tf.Variable(tf.zeros(1))
       delta_w = tf.Variable(tf.zeros(weight_shape))
       delta_b = tf.Variable(tf.zeros(1))
       return ((s_w, delta_w), (s_b, delta_b))

   def adadelta(params, grads, states, hyperparams):
       """Run one Adadelta step on every parameter via sliced ``assign``.

       ``grads`` and ``states`` are iterated in lockstep with ``params``;
       each state is an ``(s, delta)`` pair. ``hyperparams['rho']`` is the
       decay rate used for both running averages and ``eps`` is fixed at
       1e-5. Nothing is returned.
       """
       rho = hyperparams['rho']
       eps = 1e-5
       for param, (sq_avg, delta_avg), grad in zip(params, states, grads):
           sq_avg[:].assign(rho * sq_avg + (1 - rho) * tf.math.square(grad))
           # Rescale the gradient by the ratio of the two running averages.
           step = (tf.math.sqrt(delta_avg + eps)
                   / tf.math.sqrt(sq_avg + eps)) * grad
           param[:].assign(param - step)
           delta_avg[:].assign(rho * delta_avg + (1 - rho) * step * step)

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   %matplotlib inline
   import warnings
   from d2l import paddle as d2l

   warnings.filterwarnings("ignore")
   import paddle

   def init_adadelta_states(feature_dim):
       """Create zero-filled Adadelta state pairs for the weight and the bias.

       Returns ``((s_w, delta_w), (s_b, delta_b))``. The weight states are
       built with ``shape=(feature_dim, 1)`` and the bias states with
       ``shape=(1, )``.
       """
       weight_shape = (feature_dim, 1)
       bias_shape = (1, )
       s_w = paddle.zeros(shape=weight_shape)
       s_b = paddle.zeros(shape=bias_shape)
       delta_w = paddle.zeros(shape=weight_shape)
       delta_b = paddle.zeros(shape=bias_shape)
       return ((s_w, delta_w), (s_b, delta_b))

   def adadelta(params, states, hyperparams):
       """Run one Adadelta step on every parameter and return them as a list.

       ``states`` supplies one ``(s, delta)`` pair per parameter and
       ``hyperparams['rho']`` is the decay rate used for both running
       averages. ``eps`` is fixed at 1e-5. Each parameter is updated in
       place, its gradient is zeroed, and the parameter is appended to the
       returned list in iteration order.
       """
       rho = hyperparams['rho']
       eps = 1e-5
       updated_params = []
       for param, (sq_avg, delta_avg) in zip(params, states):
           with paddle.no_grad():
               grad = param.grad
               # Assign through [:] so the existing tensors are modified in
               # place rather than rebinding the local names.
               sq_avg[:] = rho * sq_avg + (1 - rho) * paddle.square(grad)
               # Rescale the gradient by the ratio of the running averages.
               step = (paddle.sqrt(delta_avg + eps)
                       / paddle.sqrt(sq_avg + eps)) * grad
               param[:] -= step
               delta_avg[:] = rho * delta_avg + (1 - rho) * step * step
           param.grad.zero_()
           updated_params.append(param)
       return updated_params

.. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Fetch the data iterator and the feature dimension (batch_size=10).
   data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
   # Train with the from-scratch `adadelta` update and freshly zeroed
   # states, using rho = 0.9.
   d2l.train_ch11(adadelta, init_adadelta_states(feature_dim),
                  {'rho': 0.9}, data_iter, feature_dim);

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.243, 0.101 sec/epoch

.. figure:: output_adadelta_0b41cb_18_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Fetch the data iterator and the feature dimension (batch_size=10).
   data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
   # Train with the from-scratch `adadelta` update and freshly zeroed
   # states, using rho = 0.9.
   d2l.train_ch11(adadelta, init_adadelta_states(feature_dim),
                  {'rho': 0.9}, data_iter, feature_dim);

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.243, 0.014 sec/epoch

.. figure:: output_adadelta_0b41cb_21_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Fetch the data iterator and the feature dimension (batch_size=10).
   data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
   # Train with the from-scratch `adadelta` update and freshly zeroed
   # states, using rho = 0.9.
   d2l.train_ch11(adadelta, init_adadelta_states(feature_dim),
                  {'rho': 0.9}, data_iter, feature_dim);

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.243, 0.148 sec/epoch

.. figure:: output_adadelta_0b41cb_24_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Fetch the data iterator and the feature dimension (batch_size=10).
   data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
   # Train with the from-scratch `adadelta` update and freshly zeroed
   # states, using rho = 0.9.
   d2l.train_ch11(adadelta, init_adadelta_states(feature_dim),
                  {'rho': 0.9}, data_iter, feature_dim);

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.242, 0.059 sec/epoch

.. figure:: output_adadelta_0b41cb_27_1.svg

.. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Select the optimizer by its string name and train with rho = 0.9,
   # reusing `data_iter` from the earlier from-scratch cell.
   d2l.train_concise_ch11('adadelta', {'rho': 0.9}, data_iter)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.243, 0.103 sec/epoch

.. figure:: output_adadelta_0b41cb_33_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Pass the optimizer class itself (not an instance) together with its
   # hyperparameters; `data_iter` comes from the earlier from-scratch cell.
   trainer = torch.optim.Adadelta
   d2l.train_concise_ch11(trainer, {'rho': 0.9}, data_iter)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.243, 0.013 sec/epoch

.. figure:: output_adadelta_0b41cb_36_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # adadelta is not converging at default learning rate
   # but it's converging at lr = 5.0
   # The optimizer class itself (not an instance) is passed to the helper.
   trainer = tf.keras.optimizers.Adadelta
   d2l.train_concise_ch11(trainer, {'learning_rate':5.0, 'rho': 0.9}, data_iter)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.244, 0.101 sec/epoch

.. figure:: output_adadelta_0b41cb_39_1.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Pass the optimizer class itself (not an instance) together with its
   # hyperparameters; `data_iter` comes from the earlier from-scratch cell.
   trainer = paddle.optimizer.Adadelta
   d2l.train_concise_ch11(trainer, {'rho': 0.9}, data_iter)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   loss: 0.268, 0.031 sec/epoch

.. figure:: output_adadelta_0b41cb_42_1.svg

.. raw:: html

.. raw:: html