|
|
|
|
|
|
|
# %% [markdown]
|
|
|
|
# # Fully-Sharded Data Parallelism
|
|
|
|
|
|
|
|
|
|
|
|
# MARK: START
|
|
|
|
# %%
|
|
|
|
# let's simulate an 8-device setup on CPU
|
|
|
|
import os
|
|
|
|
|
|
|
|
# Set this to True to run the model on CPU only.
|
|
|
|
USE_CPU_ONLY = True
|
|
|
|
|
|
|
|
flags = os.environ.get("XLA_FLAGS", "")
|
|
|
|
if USE_CPU_ONLY:
|
|
|
|
flags += " --xla_force_host_platform_device_count=8" # Simulate 8 devices
|
|
|
|
# Enforce CPU-only execution
|
|
|
|
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
|
|
|
else:
|
|
|
|
# GPU flags
|
|
|
|
flags += (
|
|
|
|
"--xla_gpu_enable_triton_softmax_fusion=true "
|
|
|
|
"--xla_gpu_triton_gemm_any=false "
|
|
|
|
"--xla_gpu_enable_async_collectives=true "
|
|
|
|
"--xla_gpu_enable_latency_hiding_scheduler=true "
|
|
|
|
"--xla_gpu_enable_highest_priority_async_stream=true "
|
|
|
|
)
|
|
|
|
os.environ["XLA_FLAGS"] = flags
|
|
|
|
|
|
|
|
import functools
|
|
|
|
from pprint import pprint
|
|
|
|
from typing import Any, Dict, Tuple, Callable, Sequence
|
|
|
|
|
|
|
|
import flax.linen as nn
|
|
|
|
import jax
|
|
|
|
import jax.numpy as jnp
|
|
|
|
import numpy as np
|
|
|
|
from jax.experimental.shard_map import shard_map
|
|
|
|
from jax.sharding import Mesh, NamedSharding
|
|
|
|
from jax.sharding import PartitionSpec as P
|
|
|
|
from ml_collections import ConfigDict
|
|
|
|
import optax
|
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
|
|
|
|
PyTree = Any
|
|
|
|
Metrics = Dict[str, Tuple[jax.Array, ...]]
|
|
|
|
jax.config.update('jax_platform_name', 'cpu')
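# quick sanity check (assuming the XLA flag above was applied before jax initialized
# its backends): we should now see 8 simulated CPU devices.
print(f"Device count: {jax.device_count()}")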
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# required pieces (defined in the single-GPU optimizations notebook):
|
|
|
|
# Batch
|
|
|
|
# TrainState
|
|
|
|
# accumulate_gradients
|
|
|
|
# print_metrics
|
|
|
|
from single_gpu_optimizations import Batch, TrainState, accumulate_gradients, print_metrics
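# for reference, a minimal sketch of what is assumed to be imported above -- the real
# definitions live in single_gpu_optimizations and may differ in detail:
#
#   @flax.struct.dataclass
#   class Batch:
#       inputs: jax.Array
#       labels: jax.Array
#
#   class TrainState(flax.training.train_state.TrainState):
#       rng: jax.Array
#
# accumulate_gradients is assumed to run the loss/grad computation over
# num_minibatches slices of the batch and average the gradients, and print_metrics
# is assumed to print the accumulated (sum, count) metric pairs.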
|
|
|
|
# %%
|
|
|
|
# fold_rng_over_axis: derive a different rng for each device along a mesh axis
|
|
|
|
|
|
|
|
def fold_rng_over_axis(rng: jax.random.PRNGKey, axis_name: str) -> jax.random.PRNGKey:
|
|
|
|
"""Folds the random number generator over the given axis.
|
|
|
|
|
|
|
|
This is useful for generating a different random number for each device
|
|
|
|
across a certain axis (e.g. the model axis).
|
|
|
|
|
|
|
|
Args:
|
|
|
|
rng: The random number generator.
|
|
|
|
axis_name: The axis name to fold the random number generator over.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A new random number generator, different for each device index along the axis.
|
|
|
|
"""
|
|
|
|
axis_index = jax.lax.axis_index(axis_name)
|
|
|
|
return jax.random.fold_in(rng, axis_index)
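# quick illustration (a toy sketch, not used below): folding in explicit indices
# mimics what fold_rng_over_axis does per device -- same base rng, distinct streams.
_demo_rng = jax.random.PRNGKey(0)
print([jax.random.normal(jax.random.fold_in(_demo_rng, i)).item() for i in range(4)])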
|
|
|
|
|
|
|
|
# MARK: DATA PARALLELISM
|
|
|
|
# %% [markdown]
|
|
|
|
# # Data Parallelism
|
|
|
|
# we start with plain data parallelism
|
|
|
|
#
|
|
|
|
# using shard_map, we write single-device code and let shard_map handle the cross-device mapping (see the small warm-up cell below)
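# %%
# tiny shard_map warm-up (an illustrative sketch, independent of the model below):
# inside the mapped function each device sees only its slice of the input, and
# cross-device communication happens through explicit collectives like psum.
_warmup_mesh = Mesh(np.array(jax.devices()), ("data",))
_total = shard_map(
    lambda x: jax.lax.psum(x.sum(), "data"),  # per-device partial sum, then all-reduce
    _warmup_mesh,
    in_specs=P("data"),
    out_specs=P(),  # the psum result is identical on every device, i.e. replicated
)
print(_total(jnp.arange(8.0)))  # 0+1+...+7 = 28.0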
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# plain data parallel - sharding only data inputs and outputs
|
|
|
|
class DPClassifier(nn.Module):
|
|
|
|
# contains the attributes listed in config
|
|
|
|
# hidden_size
|
|
|
|
# dropout_rate
|
|
|
|
# dtype - for computation
|
|
|
|
# num_classes
|
|
|
|
# data_axis_name
|
|
|
|
config: ConfigDict
|
|
|
|
|
|
|
|
# note how there is no data_axis_name within the actual __call__
|
|
|
|
@nn.compact
|
|
|
|
def __call__(self, x: jax.Array, train: bool) -> jax.Array:
|
|
|
|
x = nn.Dense(
|
|
|
|
features=self.config.hidden_size,
|
|
|
|
dtype=self.config.dtype,
|
|
|
|
name="input_dense",
|
|
|
|
)(x)
|
|
|
|
x = nn.silu(x)
|
|
|
|
x = nn.Dropout(rate=self.config.dropout_rate, deterministic=not train)(x)
|
|
|
|
x = nn.Dense(
|
|
|
|
features=self.config.num_classes,
|
|
|
|
dtype=self.config.dtype,
|
|
|
|
name="output_dense",
|
|
|
|
)(x)
|
|
|
|
x = x.astype(jnp.float32)
|
|
|
|
return x
|
|
|
|
|
|
|
|
# config
|
|
|
|
data_config = ConfigDict(
|
|
|
|
dict(
|
|
|
|
batch_size=128,
|
|
|
|
num_classes=10,
|
|
|
|
input_size=784,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
model_config = ConfigDict(
|
|
|
|
dict(
|
|
|
|
hidden_size=512,
|
|
|
|
dropout_rate=0.1,
|
|
|
|
dtype=jnp.bfloat16,
|
|
|
|
num_classes=data_config.num_classes,
|
|
|
|
data_axis_name="data",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
optimizer_config = ConfigDict(
|
|
|
|
dict(
|
|
|
|
learning_rate=1e-3,
|
|
|
|
num_minibatches=4,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
config = ConfigDict(
|
|
|
|
dict(
|
|
|
|
model=model_config,
|
|
|
|
optimizer=optimizer_config,
|
|
|
|
data=data_config,
|
|
|
|
data_axis_name=model_config.data_axis_name,
|
|
|
|
seed=42,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# initialize
|
|
|
|
model_dp = DPClassifier(config=config.model)
|
|
|
|
optimizer = optax.adamw(
|
|
|
|
learning_rate=config.optimizer.learning_rate,
|
|
|
|
)
|
|
|
|
|
|
|
|
# init rng
|
|
|
|
rng = jax.random.PRNGKey(config.seed)
|
|
|
|
# init model rng
|
|
|
|
model_init_rng, data_inputs_rng, data_labels_rng = jax.random.split(rng, 3)
|
|
|
|
# create synthetic data
|
|
|
|
batch = Batch(
|
|
|
|
inputs=jax.random.normal(data_inputs_rng, (config.data.batch_size, config.data.input_size)),
|
|
|
|
labels=jax.random.randint(
|
|
|
|
data_labels_rng, (config.data.batch_size,), 0, config.data.num_classes
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
# init the data-parallel TrainState
|
|
|
|
def init_dp(rng: jax.random.PRNGKey, x: jax.Array, model: nn.Module) -> TrainState:
|
|
|
|
init_rng, rng = jax.random.split(rng)
|
|
|
|
variables = model.init({"params": init_rng}, x, train=False)
|
|
|
|
params = variables.pop("params")
|
|
|
|
state = TrainState.create(
|
|
|
|
apply_fn=model.apply,
|
|
|
|
params=params,
|
|
|
|
tx=optimizer,
|
|
|
|
rng=rng,
|
|
|
|
)
|
|
|
|
return state
|
|
|
|
|
|
|
|
# create mesh
|
|
|
|
device_array = np.array(jax.devices())
|
|
|
|
mesh = Mesh(device_array, (config.data_axis_name,))
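# optional sanity check: placing the batch onto the mesh splits it along the data
# axis, so each of the 8 simulated devices holds a (16, 784) slice of the batch.
_sharded_inputs = jax.device_put(batch.inputs, NamedSharding(mesh, P(config.data_axis_name)))
print(_sharded_inputs.sharding)
print(_sharded_inputs.addressable_shards[0].data.shape)  # per-device slice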
|
|
|
|
|
|
|
|
# here every device holds a full copy of the same parameters
# this is no different from flax.jax_utils.replicate-style replication
|
|
|
|
init_dp_fn = jax.jit(
|
|
|
|
shard_map(
|
|
|
|
functools.partial(init_dp, model=model_dp),
|
|
|
|
mesh,
|
|
|
|
in_specs=(P(), P(config.data_axis_name)),
|
|
|
|
out_specs=P(),
|
|
|
|
check_rep=False,
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
state_dp = init_dp_fn(model_init_rng, batch.inputs)
|
|
|
|
print("DP Parameters")
|
|
|
|
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), state_dp.params))
|
|
|
|
|
|
|
|
# MARK: TRAIN STEP
|
|
|
|
# %%
|
|
|
|
# train step
|
|
|
|
def loss_fn(
|
|
|
|
params: PyTree, apply_fn: Any, batch: Batch, rng: jax.Array
|
|
|
|
) -> Tuple[jax.Array, Dict[str, Any]]:
|
|
|
|
|
|
|
|
    # fold the rng so each device along the data axis gets a different dropout rng
|
|
|
|
dropout_rng = fold_rng_over_axis(rng, config.data_axis_name)
|
|
|
|
|
|
|
|
# Remaining computation is the same as before for single device.
|
|
|
|
logits = apply_fn(
|
|
|
|
{"params": params},
|
|
|
|
batch.inputs,
|
|
|
|
train=True,
|
|
|
|
rngs={"dropout": dropout_rng})
|
|
|
|
loss = optax.softmax_cross_entropy_with_integer_labels(logits, batch.labels)
|
|
|
|
correct_pred = jnp.equal(jnp.argmax(logits, axis=-1), batch.labels)
|
|
|
|
batch_size = batch.inputs.shape[0]
|
|
|
|
step_metrics = {"loss": (loss.sum(), batch_size), "accuracy": (correct_pred.sum(), batch_size)}
|
|
|
|
loss = loss.mean()
|
|
|
|
return loss, step_metrics
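# note: each metric is tracked as a (sum, count) pair; the sums can simply be
# psum-ed across devices and added across steps, and the mean is only taken at
# logging time (print_metrics is assumed to divide sum by count when printing).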
|
|
|
|
|
|
|
|
# train step dp
|
|
|
|
# simple data parallel has the model on every device
|
|
|
|
# but each device has different data
|
|
|
|
def train_step_dp(
|
|
|
|
state: TrainState,
|
|
|
|
metrics: Metrics | None,
|
|
|
|
batch: Batch,
|
|
|
|
) -> Tuple[TrainState, Metrics]:
|
|
|
|
rng, step_rng = jax.random.split(state.rng)
|
|
|
|
# accumulate gradients like before
|
|
|
|
grads, step_metrics = accumulate_gradients(
|
|
|
|
state,
|
|
|
|
batch,
|
|
|
|
step_rng,
|
|
|
|
config.optimizer.num_minibatches,
|
|
|
|
loss_fn=loss_fn,
|
|
|
|
)
|
|
|
|
# Update parameters. We need to sync the gradients across devices before updating.
|
|
|
|
with jax.named_scope("sync_gradients"):
|
|
|
|
grads = jax.tree.map(
|
|
|
|
lambda g: jax.lax.pmean(
|
|
|
|
g, axis_name=config.data_axis_name),
|
|
|
|
grads)
|
|
|
|
new_state = state.apply_gradients(grads=grads, rng=rng)
|
|
|
|
|
|
|
|
# Sum metrics across replicas. Alternatively, we could keep the metrics separate
|
|
|
|
# and only synchronize them before logging. For simplicity, we sum them here.
|
|
|
|
with jax.named_scope("sync_metrics"):
|
|
|
|
step_metrics = jax.tree.map(
|
|
|
|
lambda x: jax.lax.psum(x, axis_name=config.data_axis_name), step_metrics
|
|
|
|
)
|
|
|
|
|
|
|
|
if metrics is None:
|
|
|
|
metrics = step_metrics
|
|
|
|
else:
|
|
|
|
# combine all the synced metrics
|
|
|
|
metrics = jax.tree.map(jnp.add, metrics, step_metrics)
|
|
|
|
|
|
|
|
return new_state, metrics
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# we will now wrap the train step with shard_map and jit it
|
|
|
|
# only the batch is sharded along the data axis; state and metrics stay replicated
|
|
|
|
train_step_dp_fn = jax.jit(
|
|
|
|
shard_map(
|
|
|
|
train_step_dp,
|
|
|
|
mesh,
|
|
|
|
in_specs=(P(), P(), P(config.data_axis_name)),
|
|
|
|
out_specs=(P(), P()),
|
|
|
|
check_rep=False,
|
|
|
|
),
|
|
|
|
    # donate the buffers of state and metrics: they are replaced by the returned
    # values, so XLA may reuse their memory instead of allocating new buffers
|
|
|
|
donate_argnames=("state", "metrics"),
|
|
|
|
)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# get the metric_shapes so that we can init arrays for accumulation
|
|
|
|
_, metric_shapes = jax.eval_shape(
|
|
|
|
train_step_dp_fn,
|
|
|
|
state_dp,
|
|
|
|
None,
|
|
|
|
batch,
|
|
|
|
)
|
|
|
|
# init arrays with shape
|
|
|
|
metrics_dp = jax.tree.map(
|
|
|
|
lambda x: jnp.zeros(x.shape, dtype=x.dtype),
|
|
|
|
metric_shapes)
|
|
|
|
|
|
|
|
|
|
|
|
# %%
|
|
|
|
start_time = time.time()
|
|
|
|
for _ in range(15):
|
|
|
|
state_dp, metrics_dp = train_step_dp_fn(state_dp, metrics_dp, batch)
|
|
|
|
duration = time.time() - start_time
|
|
|
|
print(f"DP training time for 15 steps (including compilation): {duration:.2f}s")
|
|
|
|
|
|
|
|
final_metrics_dp = jax.tree.map(
|
|
|
|
lambda x: jnp.zeros(x.shape, dtype=x.dtype),
|
|
|
|
metric_shapes)
|
|
|
|
state_dp, final_metrics_dp = train_step_dp_fn(
|
|
|
|
state_dp,
|
|
|
|
final_metrics_dp,
|
|
|
|
batch)
|
|
|
|
print_metrics(final_metrics_dp, "DP - Final metrics")
|
|
|
|
|
|
|
|
# %%
|
|
|
|
print("DP Parameters")
|
|
|
|
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), state_dp.params))
|
|
|
|
print("Metrics")
|
|
|
|
pprint(jax.tree.map(lambda x: (x.shape, x.sharding), final_metrics_dp))
|
|
|
|
|
|
|
|
####################################################################
|
|
|
|
# everything up to this point works, and it is still functionally the same as the
# flax replicate-style data parallelism used in the HuggingFace Flax examples
|
|
|
|
|
|
|
|
|
|
|
|
# MARK: PARAMETER SHARDING
|
|
|
|
# %% [markdown]
|
|
|
|
# # Parameter Sharding
|
|
|
|
# Basic strategy: init full parameters on each device, then use
|
|
|
|
# jax.lax.axis_index to split parameters across devices, and keep a shard on
|
|
|
|
# each device
|
|
|
|
#
|
|
|
|
# use nn.Partitioned to annotate sharding spec on parameters
|
|
|
|
# quite similar to PartitionSpec
|
|
|
|
#
|
|
|
|
# parameters are either jax.Array or a flax.linen.Partitioned
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# type annotation
|
|
|
|
Parameter = jax.Array | nn.Partitioned
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# function to shard parameters across devices
|
|
|
|
# look for an axis that splits evenly across the number of devices
# since parameters vary in size, we set a floor (min_weight_size) below which a
# parameter is left replicated rather than sharded
|
|
|
|
@jax.named_scope("shard_params")
|
|
|
|
def shard_params(params: PyTree, axis_name: str, min_weight_size: int = 2**18) -> PyTree:
|
|
|
|
"""Shard parameters across the given mesh axis.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
params: The parameters to shard.
|
|
|
|
axis_name: The axis to shard parameters across.
|
|
|
|
min_weight_size: The minimum size of a parameter to shard. Parameters with fewer values will not be sharded.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
PyTree of same structure as params, but with leaves sharded over new axis if possible.
|
|
|
|
"""
|
|
|
|
# axis_index
|
|
|
|
axis_idx = jax.lax.axis_index(axis_name)
|
|
|
|
# number of units in the axis
|
|
|
|
axis_size = jax.lax.psum(1, axis_name)
|
|
|
|
|
|
|
|
# split function
|
|
|
|
    # check whether each leaf has already been sharded
|
|
|
|
def _split(x: Parameter) -> Parameter:
|
|
|
|
|
|
|
|
# already sharded
|
|
|
|
if isinstance(x, nn.Partitioned):
|
|
|
|
value, names = x.value, x.names
|
|
|
|
# not sharded
|
|
|
|
else:
|
|
|
|
value = x
|
|
|
|
names = (None,) * value.ndim
|
|
|
|
|
|
|
|
        # the log statements below only run during tracing (i.e. on the first jit)
        # first, handle the parameters we cannot or should not shard:
        # if the leaf is already sharded on this axis, leave it untouched
|
|
|
|
if axis_name in names:
|
|
|
|
logging.warning(
|
|
|
|
f"Parameter {value.shape} with names {names} already sharded on axis {axis_name}."
|
|
|
|
)
|
|
|
|
return x
|
|
|
|
        # skip parameters that are too small to be worth sharding
|
|
|
|
elif value.size <= min_weight_size:
|
|
|
|
logging.info(
|
|
|
|
f"Parameter {value.shape} with names {names} too small to shard, size {value.size} < {min_weight_size}."
|
|
|
|
)
|
|
|
|
return x
|
|
|
|
# let's start sharding!
|
|
|
|
else:
|
|
|
|
shape = value.shape
|
|
|
|
idx = np.argsort(shape)[::-1] # Shard along largest possible axis.
|
|
|
|
for i in idx:
|
|
|
|
                # the loop exits via return on the first suitable axis
                # we only shard along an axis that splits evenly across the devices
                # and that is not already sharded along another mesh axis
|
|
|
|
if shape[i] % axis_size == 0 and names[i] is None:
|
|
|
|
split_size = shape[i] // axis_size
|
|
|
|
p_sharded = nn.Partitioned(
|
|
|
|
value=jax.lax.dynamic_slice_in_dim( # Shard to keep on present device.
|
|
|
|
value,
|
|
|
|
axis_idx * split_size,
|
|
|
|
split_size,
|
|
|
|
axis=i
|
|
|
|
),
|
|
|
|
names=names[:i] + (axis_name,) + names[i + 1 :],
|
|
|
|
)
|
|
|
|
return p_sharded
|
|
|
|
|
|
|
|
logging.warning(
|
|
|
|
f"Could not shard {value.shape} with names {names} on axis {axis_name}, no suitable axis found."
|
|
|
|
)
|
|
|
|
return x
|
|
|
|
|
|
|
|
# we apply the _split function across the parameter pytree
|
|
|
|
return jax.tree.map(
|
|
|
|
_split,
|
|
|
|
params,
|
|
|
|
is_leaf=lambda x: isinstance(
|
|
|
|
x, nn.Partitioned
|
|
|
|
), # Consider a nn.Partitioned object as a leaf.
|
|
|
|
)
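# rough arithmetic for the classifier trained below (illustrative only, assuming the
# 8-device mesh and ignoring the min_weight_size floor): shard along the largest axis
# that divides evenly by the device count, otherwise keep the parameter replicated.
_axis_size = 8
for _shape in [(784, 512), (512,), (512, 10), (10,)]:
    _splittable = [int(i) for i in np.argsort(_shape)[::-1] if _shape[i] % _axis_size == 0]
    print(_shape, "-> shard axis", _splittable[0] if _splittable else "none (replicated)")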
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# function to gather parameters back to a single device
|
|
|
|
|
|
|
|
# but first we need to create a gather with a custom gradient, so that the backward pass computes a mean
|
|
|
|
# jax.lax.all_gather -> retrieve shards and assemble full array in each device
|
|
|
|
# jax.lax.psum_scatter -> scatter gradients back to respective devices
|
|
|
|
def gather_array_with_mean_grads(x: jax.Array, axis: int, axis_name: str):
|
|
|
|
"""Gathering with averaging gradients across replicas."""
|
|
|
|
axis_size = jax.lax.psum(1, axis_name)
|
|
|
|
|
|
|
|
# Define a custom gradient for the gather operation.
|
|
|
|
@jax.custom_gradient
|
|
|
|
def f(x):
|
|
|
|
# adjust backward to turn sum into mean of axis
|
|
|
|
def grad_fn(g):
|
|
|
|
            # effectively a "pmean_scatter": psum_scatter followed by division by axis_size
            # the backward pass receives the gradient of the full (gathered) array, but each
            # device only keeps the gradient slice that belongs to its own parameter shard
|
|
|
|
return (
|
|
|
|
jax.lax.psum_scatter(g, axis_name, scatter_dimension=axis, tiled=True) / axis_size
|
|
|
|
)
|
|
|
|
|
|
|
|
# assemble shards to form full gradient array
|
|
|
|
return jax.lax.all_gather(x, axis_name, axis=axis, tiled=True), grad_fn
|
|
|
|
|
|
|
|
return f(x)
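# intuition check with plain numpy (no devices involved): if every one of the 8
# devices holds the same gathered gradient, psum_scatter(tiled=True) hands each
# device one slice of the cross-device sum, so dividing by axis_size gives back
# the per-replica mean for that shard.
_g_per_device = np.ones((8, 8))            # 8 devices x gathered gradient of length 8
_cross_device_sum = _g_per_device.sum(0)   # every entry is 8
print(_cross_device_sum[:1] / 8)           # the slice device 0 keeps, rescaled -> [1.]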
|
|
|
|
|
|
|
|
# gather params back - e.g. when computing a module forward call
|
|
|
|
# reverse operation of "shard_params"
|
|
|
|
# depends on: gather_array_with_mean_grads
|
|
|
|
@jax.named_scope("gather_params")
|
|
|
|
def gather_params(params: PyTree, axis_name: str) -> PyTree:
|
|
|
|
"""Gather parameters from all replicas across the given axis.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
params: The parameters to gather.
|
|
|
|
axis_name: The axis to gather parameters across.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
PyTree of same structure as params, but with leaves gathered if they were a nn.Partitioned object.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def _gather(p: Parameter) -> Parameter:
|
|
|
|
if isinstance(p, nn.Partitioned) and axis_name in p.names:
|
|
|
|
param_shard = p.names
|
|
|
|
shard_axis = param_shard.index(axis_name)
|
|
|
|
value = gather_array_with_mean_grads(p.value, axis=shard_axis, axis_name=axis_name)
|
|
|
|
|
|
|
|
# If there are any other axes that are sharded, we need to keep the partitioned structure.
|
|
|
|
# Otherwise, we can return the value directly.
|
|
|
|
param_shard = param_shard[:shard_axis] + (None,) + param_shard[shard_axis + 1 :]
|
|
|
|
if any([name is not None for name in param_shard]):
|
|
|
|
# we return the still-sharded axes shard
|
|
|
|
return nn.Partitioned(value, param_shard)
|
|
|
|
else:
|
|
|
|
return value
|
|
|
|
else:
|
|
|
|
return p
|
|
|
|
|
|
|
|
# we find all the sharded params and gather them, returning a complete parameter
|
|
|
|
return jax.tree.map(
|
|
|
|
_gather,
|
|
|
|
params,
|
|
|
|
is_leaf=lambda x: isinstance(x, nn.Partitioned))
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# when we call a module, the full parameters are gathered onto every device for the duration of the call
|
|
|
|
# wrap a module into a nn.map_variables transform
|
|
|
|
# allows for transforms on the parameter before and after a module call
|
|
|
|
# depends on: gather_params, shard_params
|
|
|
|
def shard_module_params(
|
|
|
|
target: nn.Module | Callable,
|
|
|
|
axis_name: str,
|
|
|
|
min_weight_size: int = 2**18 # 262,144
|
|
|
|
) -> nn.Module | Callable:
|
|
|
|
"""Shard parameters of a module across replicas.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
target: The module to shard.
|
|
|
|
axis_name: The axis name to shard parameters across.
|
|
|
|
min_weight_size: The minimum size of a parameter to shard. Parameters with fewer values will not be sharded.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
The module with sharded parameters.
|
|
|
|
"""
|
|
|
|
return nn.map_variables(
|
|
|
|
target,
|
|
|
|
trans_in_fn=functools.partial(
|
|
|
|
gather_params, axis_name=axis_name),
|
|
|
|
trans_out_fn=functools.partial(
|
|
|
|
shard_params, axis_name=axis_name, min_weight_size=min_weight_size
|
|
|
|
),
|
|
|
|
mapped_collections="params",
|
|
|
|
mutable=True,
|
|
|
|
)
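# how this works: nn.map_variables intercepts the "params" collection -- trans_in_fn
# (gather_params) runs when the wrapped module reads its parameters, so the forward
# pass sees full arrays, and trans_out_fn (shard_params) runs when parameters are
# written back (e.g. at init), so only the local shard is stored; mutable=True allows
# that write during initialization.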
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# define the FSDP classifier: same architecture as DPClassifier, but each Dense layer
# is wrapped with shard_module_params so its parameters are sharded across devices
# this forms the template for sharding larger modules later
# (remember, larger flax modules are composed of these elementary flax modules)
|
|
|
|
class FSDPClassifier(nn.Module):
|
|
|
|
config: ConfigDict
|
|
|
|
|
|
|
|
@nn.compact
|
|
|
|
def __call__(self, x: jax.Array, train: bool) -> jax.Array:
|
|
|
|
# create a sharded module
|
|
|
|
sharded_dense = shard_module_params(
|
|
|
|
nn.Dense,
|
|
|
|
axis_name=self.config.data_axis_name, # axes
|
|
|
|
min_weight_size=self.config.min_weight_size, # min_weight
|
|
|
|
)
|
|
|
|
x = sharded_dense(
|
|
|
|
features=self.config.hidden_size,
|
|
|
|
dtype=self.config.dtype,
|
|
|
|
name="input_dense",
|
|
|
|
)(x)
|
|
|
|
x = nn.silu(x)
|
|
|
|
x = nn.Dropout(rate=self.config.dropout_rate, deterministic=not train)(x)
|
|
|
|
x = sharded_dense(
|
|
|
|
features=self.config.num_classes,
|
|
|
|
dtype=self.config.dtype,
|
|
|
|
name="output_dense",
|
|
|
|
)(x)
|
|
|
|
x = x.astype(jnp.float32)
|
|
|
|
return x
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# initialization
|
|
|
|
config.model.min_weight_size = 2**4
|
|
|
|
model_fsdp = FSDPClassifier(config=config.model)
|
|
|
|
|
|
|
|
# the same init function as before, repeated here for readability
|
|
|
|
def init_dp(rng: jax.random.PRNGKey, x: jax.Array, model: nn.Module) -> TrainState:
|
|
|
|
init_rng, rng = jax.random.split(rng)
|
|
|
|
variables = model.init({"params": init_rng}, x, train=False)
|
|
|
|
params = variables.pop("params")
|
|
|
|
state = TrainState.create(
|
|
|
|
apply_fn=model.apply,
|
|
|
|
params=params,
|
|
|
|
tx=optimizer,
|
|
|
|
rng=rng,
|
|
|
|
)
|
|
|
|
return state
|
|
|
|
|
|
|
|
|
|
|
|
# initialize our sharded model with mesh
|
|
|
|
# we need to adjust the shard_map: how the parameters are partitioned is only
# decided inside model.init, so we cannot write the output specs down by hand
#
# the trick: trace the init once with jax.eval_shape, using a placeholder
# out_specs of P() (i.e. pretending everything is fully replicated)
#
# from the resulting shapes, which carry the nn.Partitioned annotations, we can
# then read off the real PartitionSpec of every parameter
|
|
|
|
init_fsdp_fn = shard_map(
|
|
|
|
functools.partial(init_dp, model=model_fsdp),
|
|
|
|
mesh,
|
|
|
|
# first P() is for model_init_rng
|
|
|
|
# second P(config.data_axis_name) is for batch.inputs
|
|
|
|
in_specs=(P(), P(config.data_axis_name)),
|
|
|
|
# not partitioned, fully replicated
|
|
|
|
out_specs=P(),
|
|
|
|
check_rep=False, # disable checks for replication errors in out_specs
|
|
|
|
)
|
|
|
|
state_fsdp_shapes = jax.eval_shape(init_fsdp_fn, model_init_rng, batch.inputs)
|
|
|
|
state_fsdp_specs = nn.get_partition_spec(state_fsdp_shapes)
|
|
|
|
# %% [raw]
|
|
|
|
# TrainState(step=PartitionSpec(), apply_fn=<bound method Module.apply of FSDPClassifier(
|
|
|
|
# # attributes
|
|
|
|
# config = data_axis_name: data
|
|
|
|
# dropout_rate: 0.1
|
|
|
|
# dtype: !!python/name:jax.numpy.bfloat16 ''
|
|
|
|
# hidden_size: 512
|
|
|
|
# min_weight_size: 16
|
|
|
|
# num_classes: 10
|
|
|
|
#
|
|
|
|
# )>,
|
|
|
|
# params={
|
|
|
|
# 'input_dense': {'bias': PartitionSpec('data',), 'kernel': PartitionSpec('data', None)},
|
|
|
|
# 'output_dense': {'bias': PartitionSpec(), 'kernel': PartitionSpec('data', None)}},
|
|
|
|
# tx=GradientTransformationExtraArgs(init=<function chain.<locals>.init_fn at 0x761e8ef00400>,
|
|
|
|
# update=<function chain.<locals>.update_fn at 0x761e8ef01080>),
|
|
|
|
# opt_state=(ScaleByAdamState(count=PartitionSpec(),
|
|
|
|
# mu={'input_dense': {'bias': PartitionSpec('data',), 'kernel': PartitionSpec('data', None)},
|
|
|
|
# 'output_dense': {'bias': PartitionSpec(), 'kernel': PartitionSpec('data', None)}},
|
|
|
|
# nu={'input_dense': {'bias': PartitionSpec('data',), 'kernel': PartitionSpec('data', None)},
|
|
|
|
# 'output_dense': {'bias': PartitionSpec(), 'kernel': PartitionSpec('data', None)}}),
|
|
|
|
# EmptyState(), EmptyState()), rng=PartitionSpec())
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# from state_fsdp_specs we can now read off the sharding layout
# this print makes the earlier point concrete: we could not write these
# PartitionSpecs down by hand, because which parameters get sharded is only
# decided during model init
|
|
|
|
print("RNG", state_fsdp_specs.rng)
|
|
|
|
print("\nParameters")
|
|
|
|
pprint(state_fsdp_specs.params)
|
|
|
|
print("\nOptimizer state")
|
|
|
|
pprint(state_fsdp_specs.opt_state[0])
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# init again, this time with output specs that describe exactly which parameters
# are sharded and which stay replicated
|
|
|
|
init_fsdp_fn = jax.jit(
|
|
|
|
shard_map(
|
|
|
|
functools.partial(init_dp, model=model_fsdp),
|
|
|
|
mesh,
|
|
|
|
in_specs=(P(), P(config.data_axis_name)),
|
|
|
|
out_specs=state_fsdp_specs,
|
|
|
|
check_rep=False,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
state_fsdp = init_fsdp_fn(model_init_rng, batch.inputs)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
print("FSDP Parameters")
|
|
|
|
pprint(jax.tree.map(lambda x: x.shape, jax.device_get(state_fsdp.params)))
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# train step
|
|
|
|
|
|
|
|
# we need to handle the sync of gradients
|
|
|
|
# some parameters are sharded, some are not
|
|
|
|
def sync_gradients(
|
|
|
|
grads: PyTree,
|
|
|
|
axis_names: Sequence[str],
|
|
|
|
) -> PyTree:
|
|
|
|
"""Synchronize gradients across devices.
|
|
|
|
|
|
|
|
Gradients for parameters that are replicated over a given axis are averaged across devices.
|
|
|
|
Parameters that are partitioned over a given axis are considered to already have a mean of
|
|
|
|
the gradients on each device, and hence do not need to be altered.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
grads: The gradients to synchronize.
|
|
|
|
axis_names: The axis names to synchronize gradients across.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
The gradients averaged over the specified axes if they are replicated.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def sync_grad(g: Parameter) -> Parameter:
|
|
|
|
if isinstance(g, nn.Partitioned):
|
|
|
|
# Tree leaves for flattening potentially nested axis (multiple names
|
|
|
|
# can exist for single array axis).
|
|
|
|
replication_axis_names = [
|
|
|
|
name for name in axis_names if name not in jax.tree_util.tree_leaves(g.names)
|
|
|
|
]
|
|
|
|
if len(replication_axis_names) == 0:
|
|
|
|
# Parameters partitioned over all axes.
|
|
|
|
return g
|
|
|
|
else:
|
|
|
|
# Average over remaining replicated axes.
|
|
|
|
return g.replace(value=jax.lax.pmean(g.value, axis_name=replication_axis_names))
|
|
|
|
else:
|
|
|
|
# Parameters are replicated over all axes.
|
|
|
|
return jax.lax.pmean(g, axis_name=axis_names)
|
|
|
|
|
|
|
|
return jax.tree.map(
|
|
|
|
sync_grad,
|
|
|
|
grads,
|
|
|
|
is_leaf=lambda x: isinstance(x, nn.Partitioned))
|
|
|
|
|
|
|
|
# %%
|
|
|
|
def train_step_fsdp(
|
|
|
|
state: TrainState,
|
|
|
|
metrics: Metrics,
|
|
|
|
batch: Batch,
|
|
|
|
) -> Tuple[TrainState, Metrics]:
|
|
|
|
rng, step_rng = jax.random.split(state.rng)
|
|
|
|
    # accumulate gradients over the minibatches, same as in the DP train step
|
|
|
|
grads, step_metrics = accumulate_gradients(
|
|
|
|
state,
|
|
|
|
batch,
|
|
|
|
step_rng,
|
|
|
|
config.optimizer.num_minibatches,
|
|
|
|
loss_fn=loss_fn,
|
|
|
|
)
|
|
|
|
# Update parameters. We need to sync the gradients across devices before updating.
|
|
|
|
with jax.named_scope("sync_gradients"):
|
|
|
|
grads = sync_gradients(grads, (config.data_axis_name,))
|
|
|
|
# then update model
|
|
|
|
new_state = state.apply_gradients(grads=grads, rng=rng)
|
|
|
|
|
|
|
|
# Sum metrics across replicas. Alternatively, we could keep the metrics separate
|
|
|
|
# and only synchronize them before logging. For simplicity, we sum them here.
|
|
|
|
with jax.named_scope("sync_metrics"):
|
|
|
|
step_metrics = jax.tree.map(
|
|
|
|
lambda x: jax.lax.psum(x, axis_name=config.data_axis_name), step_metrics
|
|
|
|
)
|
|
|
|
if metrics is None:
|
|
|
|
metrics = step_metrics
|
|
|
|
else:
|
|
|
|
metrics = jax.tree.map(jnp.add, metrics, step_metrics)
|
|
|
|
return new_state, metrics
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# jit the train_step_fsdp
|
|
|
|
train_step_fsdp_fn = jax.jit(
|
|
|
|
shard_map(
|
|
|
|
train_step_fsdp,
|
|
|
|
mesh,
|
|
|
|
in_specs=(state_fsdp_specs, P(), P(config.data_axis_name)),
|
|
|
|
out_specs=(state_fsdp_specs, P()),
|
|
|
|
check_rep=False,
|
|
|
|
),
|
|
|
|
donate_argnames=("state", "metrics"),
|
|
|
|
)
|
|
|
|
|
|
|
|
# get the metric shape to initialize accumulator arrays for metrics
|
|
|
|
_, metric_shapes = jax.eval_shape(
|
|
|
|
train_step_fsdp_fn,
|
|
|
|
state_fsdp,
|
|
|
|
None,
|
|
|
|
batch,
|
|
|
|
)
|
|
|
|
metrics_fsdp = jax.tree.map(lambda x: jnp.zeros(x.shape, dtype=x.dtype), metric_shapes)
|
|
|
|
# %%
|
|
|
|
# train
|
|
|
|
start_time = time.time()
|
|
|
|
for _ in range(15):
|
|
|
|
state_fsdp, metrics_fsdp = train_step_fsdp_fn(
|
|
|
|
state_fsdp,
|
|
|
|
metrics_fsdp,
|
|
|
|
batch)
|
|
|
|
duration = time.time() - start_time
|
|
|
|
print(f"FSDP training time for 15 steps (including compilation): {duration:.2f}s")
|
|
|
|
|
|
|
|
# reset the metric accumulators and run one more step to get clean final metrics
|
|
|
|
final_metrics_fsdp = jax.tree.map(
|
|
|
|
lambda x: jnp.zeros(x.shape, dtype=x.dtype),
|
|
|
|
metric_shapes)
|
|
|
|
state_fsdp, final_metrics_fsdp = train_step_fsdp_fn(
|
|
|
|
state_fsdp,
|
|
|
|
final_metrics_fsdp,
|
|
|
|
batch)
|
|
|
|
print_metrics(final_metrics_fsdp, "FSDP - Final metrics")
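# %%
# optional cross-check (a sketch, assuming state_dp and state_fsdp are both still in
# memory, were trained on identical data with the same seed, and that bfloat16
# rounding differences stay small): after gathering the FSDP shards into full arrays,
# the two strategies should end up with numerically close parameters.
params_fsdp_full = jax.device_get(
    jax.tree.map(
        lambda p: p.value if isinstance(p, nn.Partitioned) else p,
        state_fsdp.params,
        is_leaf=lambda x: isinstance(x, nn.Partitioned),
    )
)
params_dp_full = jax.device_get(state_dp.params)
pprint(
    jax.tree.map(
        lambda a, b: bool(np.allclose(np.asarray(a), np.asarray(b), atol=1e-2)),
        params_dp_full,
        params_fsdp_full,
    )
)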
|
|
|
|
|
|
|
|
|
|
|
|
# %%
|