Feat: flax pjit example

2024-09-16 12:19:07 +09:00 · 2024-09-16 12:19:07 +09:00 · 429e1742ab
parent ad5cf7735f
commit 429e1742ab
5 changed files with 1078 additions and 63 deletions
--- a/parallel/dataload.py
+++ b/parallel/dataload.py
@ -98,21 +98,21 @@ class DataPrepare():
        )

        # check that data fits
-        for name in ['input_ids', 'attention_mask', 'labels', 'decoder_input_ids', 'decoder_attention_mask']:
-            int_array: np.array = train_dataset[name]
-            if np.all((int_array >= 0) & (int_array <= 65535)):
-                continue
-            else:
-                raise ValueError("Values are out of range for uint16")
+        # for name in ['input_ids', 'attention_mask', 'labels', 'decoder_input_ids', 'decoder_attention_mask']:
+        #     int_array: np.array = train_dataset[name]
+        #     if np.all((int_array >= 0) & (int_array <= 65535)):
+        #         continue
+        #     else:
+        #         raise ValueError("Values are out of range for uint16")

        # change to compact datatypes
-        features = train_dataset.features.copy()
-        features['input_ids'] = Sequence(Value('uint16'))
-        features['attention_mask'] = Sequence(Value('bool'))
-        features['labels'] = Sequence(Value('uint16'))
-        features['decoder_input_ids'] = Sequence(Value('uint16'))
-        features['decoder_attention_mask'] = Sequence(Value('bool'))
-        train_dataset = train_dataset.cast(features)
+        # features = train_dataset.features.copy()
+        # features['input_ids'] = Sequence(Value('uint16'))
+        # features['attention_mask'] = Sequence(Value('uint16'))
+        # features['labels'] = Sequence(Value('uint16'))
+        # features['decoder_input_ids'] = Sequence(Value('uint16'))
+        # features['decoder_attention_mask'] = Sequence(Value('uint16'))
+        # train_dataset = train_dataset.cast(features)
        # assign the dataset to train_dataset
        self.train_dataset = train_dataset

@ -140,15 +140,15 @@ class DataPrepare():

        for idx in batch_idx:
            batch = dataset[idx]
-            batch = {k: jnp.array(v) for k, v in batch.items()}
+            batch = {k: v for k, v in batch.items()}

            yield batch


 # testing out the class
-# # %%
-# # init object
-# # e.g. Config
+# %%
+# init object
+# e.g. Config
 # data_config = ConfigDict(
 #     dict(
 #         max_length=86,
@ -157,7 +157,9 @@ class DataPrepare():
 #     )
 # )
 # 
-# dataprep = DataPrepare(split_datasets, data_config)
+# from datasets import load_from_disk
+# split_datasets = load_from_disk(file_path)
+# dataprep = DataPrepare(split_datasets['train'], data_config)
 # 
 # # %%
 # seed = 117
--- a/parallel/flax_pjit_tutorial.py
+++ b/parallel/flax_pjit_tutorial.py
@ -0,0 +1,456 @@
+# MARK: START
+# %%
+# let's make 8-device simulator
+import os
+
+# Set this to True to run the model on CPU only.
+USE_CPU_ONLY = True
+
+flags = os.environ.get("XLA_FLAGS", "")
+if USE_CPU_ONLY:
+    flags += " --xla_force_host_platform_device_count=4"  # Simulate 8 devices
+    # Enforce CPU-only execution
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    os.environ["JAX_PLATFORMS"] = "cpu"
+else:
+    # GPU flags
+    flags += (
+        "--xla_gpu_enable_triton_softmax_fusion=true "
+        "--xla_gpu_triton_gemm_any=false "
+        "--xla_gpu_enable_async_collectives=true "
+        "--xla_gpu_enable_latency_hiding_scheduler=true "
+        "--xla_gpu_enable_highest_priority_async_stream=true "
+    )
+os.environ["XLA_FLAGS"] = flags
+
+import functools
+from functools import partial
+from pprint import pprint
+from typing import Any, Dict, Tuple, Callable, Sequence
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from jax.experimental.shard_map import shard_map
+from jax.sharding import Mesh, NamedSharding
+# from jax.experimental.pjit import pjit  # superseded by jax.jit
+from jax.experimental import mesh_utils
+from jax.sharding import PartitionSpec
+from ml_collections import ConfigDict
+import optax
+import logging
+import time
+from datasets import Dataset, load_from_disk
+
+from flax import jax_utils, traverse_util
+from flax.jax_utils import pad_shard_unpad, unreplicate
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+import flax.core
+
+from tqdm import tqdm
+
+from dataload import DataPrepare
+
+PyTree = Any
+Metrics = Dict[str, Tuple[jax.Array, ...]]
+
+if USE_CPU_ONLY:
+    jax.config.update('jax_platform_name', 'cpu')
+else:
+    jax.config.update("jax_default_matmul_precision", "bfloat16")
+
+
+# MARK: sharding example
+# %%
+device_mesh = mesh_utils.create_device_mesh((2,2))
+print(device_mesh)
+
+mesh = Mesh(devices=device_mesh, axis_names=('data', 'model'))
+print(mesh)
+
+def mesh_sharding(pspec: PartitionSpec) -> NamedSharding:
+    return NamedSharding(mesh, pspec)
+
+# %%
+# define a layer
+class DotReluDot(nn.Module):
+    depth: int
+    dense_init: Callable = nn.initializers.xavier_normal()
+    @nn.compact
+    def __call__(self, x):
+
+        # y has shape (x.shape[-1], self.depth)
+        # we replicate data across devices
+        # but we shard the layer across the model axes
+        y = nn.Dense(self.depth,
+                    kernel_init=nn.with_partitioning(self.dense_init, (None, 'model')),
+                    use_bias=False,  # or overwrite with `bias_init`
+                    )(x)
+
+        y = jax.nn.relu(y)
+        # Force a local sharding annotation.
+        # annotate intermediate variables to force a particular sharding pattern
+        # when ideal constraint is known
+        y = jax.lax.with_sharding_constraint(y, mesh_sharding(PartitionSpec('data', 'model')))
+
+        W2 = self.param(
+            'W2',
+            nn.with_partitioning(self.dense_init, ('model', None)),
+            (self.depth, x.shape[-1]))
+
+        z = jnp.dot(y, W2)
+        # Force a local sharding annotation.
+        z = jax.lax.with_sharding_constraint(z, mesh_sharding(PartitionSpec('data', None)))
+
+        # Return a tuple to conform with the API `flax.linen.scan` as shown in the cell below.
+        return z, None
+
+# %%
+class MLP(nn.Module):
+    num_layers: int
+    depth: int
+    use_scan: bool
+    @nn.compact
+    def __call__(self, x):
+        if self.use_scan:
+            x, _ = nn.scan(
+                DotReluDot,
+                length=self.num_layers,
+                variable_axes={"params": 0},
+                split_rngs={"params": True},
+                metadata_params={nn.PARTITION_NAME: None}
+            )(self.depth)(x)
+        else:
+            for _ in range(self.num_layers):
+                x, _ = DotReluDot(self.depth)(x)
+            return x
+
+# %%
+# MLP hyperparameters.
+BATCH, LAYERS, DEPTH, USE_SCAN = 8, 4, 1024, False
+# Create fake inputs.
+x = jnp.ones((BATCH, DEPTH))
+# Initialize a PRNG key.
+k = jax.random.key(117)
+
+# Create an Optax optimizer.
+optimizer = optax.adam(learning_rate=0.001)
+# Instantiate the model.
+model = MLP(LAYERS, DEPTH, USE_SCAN)
+
+# %%
+# specify sharding
+
+# shard data
+x_sharding = mesh_sharding(PartitionSpec('data', None))  # replicate across data axis
+x = jax.device_put(x, x_sharding)
+jax.debug.visualize_array_sharding(x)
+
+# shard output
+# we will shard state by tracking its output upon jax.eval_shape after init
+# define an init function to return a TrainState
+def init_fn(key, x, model, optimizer):
+    # do be careful with the model init
+    # imported models might have complicated init methods
+    variables = model.init(key, x)  # Initialize the model.
+    state = train_state.TrainState.create(  # Create a `TrainState`.
+        apply_fn=model.apply,
+        params=variables['params'],
+        tx=optimizer)
+    return state
+
+# Create an abstract closure to wrap the function before feeding it in
+# because `jax.eval_shape` only takes pytrees as arguments.
+# eval_shape(fn, rng_key, x)
+# used to perform shape inference
+# returns a nested PyTree containing jax.ShapeDtypeStruct objects as leaves
+abstract_variables = jax.eval_shape(
+    functools.partial(init_fn, model=model, optimizer=optimizer), k, x)
+
+
+# This `state_sharding` has the same pytree structure as `state`, the output
+# of the `init_fn`.
+# flan.linen.get_sharding
+# extracts a jax.sharding tree from a PyTree containing Partitioned values and a mesh
+# jax.sharding: describes how a jax.Array is laid out across devices
+state_sharding = nn.get_sharding(abstract_variables, mesh)
+print(state_sharding)
+
+# %%
+jit_init_fn = jax.jit(
+    init_fn,
+    static_argnames=('model', 'optimizer'),  # skip model and optimizer
+    in_shardings=(mesh_sharding(()), x_sharding),  # for PRNG key and data
+    out_shardings=state_sharding
+)
+
+initialized_state = jit_init_fn(k, x, model, optimizer)
+# for weight, partitioned in initialized_state.params['DotReluDot_0'].items():
+#     print(f'Sharding of {weight}: {partitioned.names}')
+jax.debug.visualize_array_sharding(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].value)
+jax.debug.visualize_array_sharding(initialized_state.params['DotReluDot_0']['W2'].value)
+
+# %%
+# inspect module output
+# the params are actually linen.Partitioned objects
+# the Partition objects are actually wrapping jax.Array
+print(type(initialized_state.params['DotReluDot_0']['Dense_0']['kernel']))
+print(type(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].value))
+print(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].names)
+print(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].value.shape)
+# %%
+# Say for some unknown reason you want to make the whole param tree all-zero
+unboxed_params = nn.meta.unbox(initialized_state.params)
+all_zero = jax.tree.map(jnp.zeros_like, unboxed_params)
+all_zero_params = nn.meta.replace_boxed(initialized_state.params, all_zero)
+assert jnp.sum(nn.meta.unbox(all_zero_params['DotReluDot_0']['Dense_0']['kernel'])) == 0
+# %%
+# check the jax.sharding of each parameter
+print(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].value.sharding)
+print(initialized_state.step)
+print(initialized_state.step.sharding)
+# %%
+# example of computation on pytree
+diff = jax.tree.map(
+    lambda a, b: a - b,
+    initialized_state.params['DotReluDot_0'],
+    initialized_state.params['DotReluDot_0']
+)
+print(jax.tree.map(jnp.shape, diff))
+diff_array = diff['Dense_0']['kernel'].value
+print(type(diff_array))
+print(diff_array.shape)
+# %%
+# compile train step
+@functools.partial(
+    jax.jit,
+    in_shardings=(state_sharding, x_sharding),
+    out_shardings=state_sharding                
+)
+def train_step(state, x):
+    def loss_unrolled(params):
+        y = model.apply({'params': params}, x)
+        return y.sum()
+    grad_fn = jax.grad(loss_unrolled)
+    grads = grad_fn(state.params)
+    state = state.apply_gradients(grads=grads)
+    return state
+
+# this trains for 1 step
+# with mesh: # not strictly necessary in this case
+# with mesh block is useful for explicit scope for device sharding
+# but mesh management is automatic via jit sharding annotations
+new_state = train_step(initialized_state, x)
+
+print(f'Sharding of Weight 1:')
+jax.debug.visualize_array_sharding(initialized_state.params['DotReluDot_0']['Dense_0']['kernel'].value)
+print(f'Sharding of Weight 2:')
+jax.debug.visualize_array_sharding(initialized_state.params['DotReluDot_0']['W2'].value)
+
+
+# %%
+# compile inference step
+@functools.partial(
+    jax.jit, 
+    in_shardings=(state_sharding, x_sharding),
+    out_shardings=x_sharding
+)
+def apply_fn(state, x):
+  return state.apply_fn({'params': state.params}, x)
+
+# this infers for 1 step
+with mesh:
+  y = apply_fn(new_state, x)
+print(type(y))
+print(y.dtype)
+print(y.shape)
+jax.debug.visualize_array_sharding(y)
+# %%
+# profiling
+# measure performance
+import timeit
+def block_all(xs):
+  jax.tree_util.tree_map(lambda x: x.block_until_ready(), xs)
+  return xs
+
+# with mesh:
+t = timeit.timeit("block_all(train_step(initialized_state, x))", globals=globals(), number=10)
+print(t)
+
+# MARK: logical sharding
+# %%
+# logical axis annotation
+# why? 
+# rather than just be fixed with 'data' and 'model', we can annotate with logical names
+# then map these logical names back to 'data' and 'model', 
+# or be more flexible with more axes
+#
+# we will substitute with the following:
+# flax.linen.with_partitioning -> flax.linen.with_logical_partitioning
+# flax.lax.with_sharding_constraint -> flax.linen.with_logical_constraint
+
+class LogicalDotReluDot(nn.Module):
+  depth: int
+  dense_init: Callable = nn.initializers.xavier_normal()
+  @nn.compact
+  def __call__(self, x):
+    y = nn.Dense(
+        self.depth,
+        # use of logical partitioning here
+        # kernel_init is the initializer function for the weight matrix
+        kernel_init=nn.with_logical_partitioning(self.dense_init, ('embed', 'hidden')),
+        use_bias=False,  # or overwrite with `bias_init`
+        )(x)
+
+    y = jax.nn.relu(y)
+    # Force a local sharding annotation.
+    y = jax.lax.with_sharding_constraint(y, mesh_sharding(PartitionSpec('data', 'model')))
+
+    W2 = self.param(
+        'W2',
+        nn.with_logical_partitioning(self.dense_init, ('hidden', 'embed')),
+        (self.depth, x.shape[-1]))
+
+    z = jnp.dot(y, W2)
+    # Force a local sharding annotation.
+    z = nn.with_logical_constraint(z, ('batch', 'embed'))
+    return z, None
+
+class LogicalMLP(nn.Module):
+  num_layers: int
+  depth: int
+  use_scan: bool
+  @nn.compact
+  def __call__(self, x):
+    if self.use_scan:
+      x, _ = nn.scan(LogicalDotReluDot, length=self.num_layers,
+                    variable_axes={"params": 0},
+                    split_rngs={"params": True},
+                    metadata_params={nn.PARTITION_NAME: 'layer'}
+                    )(self.depth)(x)
+    else:
+      for _ in range(self.num_layers):
+        x, _ = LogicalDotReluDot(self.depth)(x)
+    return x
+
+# %%
+# we initialize the model with the eval_shape method
+# but we need to perform a rule to replace logical axes with real axes
+rules =(
+    ('batch', 'data'),
+    ('hidden', 'model')
+)
+
+logical_model = LogicalMLP(LAYERS, DEPTH, USE_SCAN)
+
+logical_abstract_variables = jax.eval_shape(
+    functools.partial(init_fn, model=logical_model, optimizer=optimizer),
+    k,
+    x,
+)
+
+# linen.get_partition_spec
+# extracts a partitionspec tree from a pytree containing partitioned values
+# linen.Partitioned
+# wrapper for partitioning metadata
+logical_state_spec = nn.get_partition_spec(logical_abstract_variables)
+print(
+    "annotations are logical, not mesh specific",
+    logical_state_spec.params['LogicalDotReluDot_0']['Dense_0']['kernel']
+)
+
+# we convert our logical_state_spec to a logical_state_sharding
+# with defined rules
+logical_state_sharding = nn.logical_to_mesh_sharding(
+    logical_state_spec,
+    mesh,
+    rules
+)
+print('sharding annotations are mesh-specific: ',
+      logical_state_sharding.params['LogicalDotReluDot_0']['Dense_0']['kernel'].spec)
+
+# %%
+# with a working state_sharding object, we can init
+logical_jit_init_fn = jax.jit(init_fn, static_argnums=(2, 3),
+                      in_shardings=(mesh_sharding(()), x_sharding),  # PRNG key and x
+                      out_shardings=logical_state_sharding)
+
+logical_initialized_state = logical_jit_init_fn(k, x, logical_model, optimizer)
+
+# %%
+# MARK: saving checkpoint
+# https://flax.readthedocs.io/en/latest/guides/training_techniques/use_checkpointing.html#multi-host-multi-process-checkpointing
+# let us save the model
+# since we already have a mesh, we will skip the making of the mesh
+
+from typing import Optional, Any
+import shutil
+
+import numpy as np
+import jax
+from jax import random, numpy as jnp
+
+import flax
+from flax import linen as nn
+from flax.training import checkpoints, train_state
+from flax import struct, serialization
+import orbax.checkpoint
+
+import optax
+
+ckpt_dir = '/tmp/flax_ckpt'
+
+if os.path.exists(ckpt_dir):
+    shutil.rmtree(ckpt_dir)  # Remove any existing checkpoints from the last notebook run.
+
+# %%
+# make up some stuff
+# A simple model with one linear layer.
+key1, key2 = random.split(random.key(0))
+x1 = random.normal(key1, (5,))      # A simple JAX array.
+model = nn.Dense(features=3)
+variables = model.init(key2, x1)
+
+# Flax's TrainState is a pytree dataclass and is supported in checkpointing.
+# Define your class with `@flax.struct.dataclass` decorator to make it compatible.
+tx = optax.sgd(learning_rate=0.001)      # An Optax SGD optimizer.
+state = train_state.TrainState.create(
+    apply_fn=model.apply,
+    params=variables['params'],
+    tx=tx)
+# Perform a simple gradient update similar to the one during a normal training workflow.
+state = state.apply_gradients(grads=jax.tree_util.tree_map(jnp.ones_like, state.params))
+
+# Some arbitrary nested pytree with a dictionary and a NumPy array.
+config = {'dimensions': np.array([5, 3])}
+
+# Bundle everything together.
+ckpt = {'model': state, 'config': config, 'data': [x1]}
+
+# %%
+# single host save with orbax
+from flax.training import orbax_utils
+
+orbax_checkpointer = orbax.checkpoint.PyTreeCheckpointer()
+save_args = orbax_utils.save_args_from_target(ckpt)
+orbax_checkpointer.save('/tmp/flax_ckpt/orbax/single_save', ckpt, save_args=save_args)
+
+
+# %%
+# multi-process checkpointing
+# aka checkpointing for sharding
+# The reference doesn't need to be as large as your checkpoint!
+# Just make sure it has the `.sharding` you want.
+# https://orbax.readthedocs.io/en/latest/guides/checkpoint/async_checkpointing.html
+# https://orbax.readthedocs.io/en/latest/guides/checkpoint/orbax_checkpoint_101.html
+
+import orbax.checkpoint as ocp
+from etils import epath
+
+path = epath.Path('/tmp/async_checkpoint')
+ckptr = ocp.AsyncCheckpointer(ocp.StandardCheckpointHandler())
+ckptr.save(path, args=ocp.args.StandardSave(train_state))
+### Do some other work...
+ckptr.wait_until_finished()
--- a/parallel/fully_sharded_data_parallelism.py
+++ b/parallel/fully_sharded_data_parallelism.py
@ -737,7 +737,8 @@ start_time = time.time()
 for _ in range(15):
    state_fsdp, metrics_fsdp = train_step_fsdp_fn(
        state_fsdp,
-        metrics_fsdp, batch)
+        metrics_fsdp, 
+        batch)
 duration = time.time() - start_time
 print(duration)

@ -747,7 +748,8 @@ final_metrics_fsdp = jax.tree.map(
    metric_shapes)
 state_fsdp, final_metrics_fsdp = train_step_fsdp_fn(
    state_fsdp,
-    final_metrics_fsdp, batch)
+    final_metrics_fsdp,
+    batch)
 print_metrics(final_metrics_fsdp, "FSDP - Final metrics")


--- a/parallel/t5_jax_train_pjit.py
+++ b/parallel/t5_jax_train_pjit.py
@ -8,7 +8,7 @@
 import os

 # Set this to True to run the model on CPU only.
-USE_CPU_ONLY = True
+USE_CPU_ONLY = False

 flags = os.environ.get("XLA_FLAGS", "")
 if USE_CPU_ONLY:
@ -20,10 +20,10 @@ else:
    # GPU flags
    flags += (
        "--xla_gpu_enable_triton_softmax_fusion=true "
-        "--xla_gpu_triton_gemm_any=false "
-        "--xla_gpu_enable_async_collectives=true "
-        "--xla_gpu_enable_latency_hiding_scheduler=true "
-        "--xla_gpu_enable_highest_priority_async_stream=true "
+        # "--xla_gpu_triton_gemm_any=false "
+        # "--xla_gpu_enable_async_collectives=true "
+        # "--xla_gpu_enable_latency_hiding_scheduler=true "
+        # "--xla_gpu_enable_highest_priority_async_stream=true "
    )
 os.environ["XLA_FLAGS"] = flags

@ -37,7 +37,8 @@ import jax
 import jax.numpy as jnp
 import numpy as np
 from jax.experimental.shard_map import shard_map
-from jax.sharding import Mesh, NamedSharding
+from jax.sharding import Mesh
+from jax.experimental.pjit import pjit
 from jax.sharding import PartitionSpec as P
 from ml_collections import ConfigDict
 import optax
@ -148,7 +149,7 @@ dataprep = DataPrepare(split_datasets['train'], data_config)
 # seed = 117
 # rng = jax.random.PRNGKey(seed)
 # train_loader = dataprep.data_loader(rng, batch_size=1)
-
+# batch = next(iter(train_loader))

 # %%
 # model
@ -161,8 +162,8 @@ config = T5Config()
 # If you want don't want to cast certain parameters (for example layer norm bias and scale)
 # then pass the mask as follows
 from flax import traverse_util
-
 model = FlaxT5ForConditionalGeneration.from_pretrained("t5-base")
+
 # useful for transformer model
 model.enable_gradient_checkpointing()

@ -174,6 +175,21 @@ mask = {
 mask = traverse_util.unflatten_dict(mask)
 model.params = model.to_bf16(model.params, mask)

+# %%
+
+
+# %%
+from jax.sharding import Mesh, NamedSharding
+from jax.experimental import mesh_utils
+from jax.sharding import PartitionSpec as P
+from pjit_partition import set_partitions
+
+devices = np.asarray(jax.devices())
+mesh_axis_names = ('data')
+mesh = Mesh(devices, 'batch')
+sharding = NamedSharding(mesh, P(mesh_axis_names))
+replicated_sharding = NamedSharding(mesh, P())
+

 # %% [markdown]
 # # Model
@ -243,22 +259,9 @@ adamw = optax.adamw(


 # %%
-# Training functions
-class TrainState(train_state.TrainState):
-    dropout_rng: jnp.ndarray
+# state will serve as our "params"
+state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)

-    # easy way to achieve data parallelism
-    # also achieves folding of rng keys
-    def replicate(self):
-        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
-
-# set bf16 for model params
-# model.params = model.to_bf16(model.params)
-# Cast parameters to bfloat16 if desired
-# params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params)
-
-# Setup train state
-state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)

 # label smoothed cross entropy
 def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0):
@ -283,9 +286,10 @@ def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0):
    num_labels = padding_mask.sum()
    return loss, num_labels

+# MARK: train_step
 # Define gradient update step fn
-@jax.jit
-def train_step(state, batch, label_smoothing_factor=0.0):
+def train_step(state, batch):
+    label_smoothing_factor=0.0
    dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)

    def compute_loss(params):
@ -311,27 +315,22 @@ def train_step(state, batch, label_smoothing_factor=0.0):
    metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
    return new_state, metrics

-# Define generation function
-max_length = (
-    val_max_target_length if val_max_target_length is not None else model.config.max_length
-)
-num_beams = num_beams if num_beams is not None else model.config.num_beams
-gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-
-# def generate_step(params, batch):
-#     model.params = params
-#     output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs)
-#     return output_ids.sequences
+# max_length = (
+#     val_max_target_length if val_max_target_length is not None else model.config.max_length
+# )
+# num_beams = num_beams if num_beams is not None else model.config.num_beams
+# gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

 # Create parallel version of the train and eval step
-p_train_step = jax.pmap(
-    partial(train_step, label_smoothing_factor=label_smoothing_factor), "batch", donate_argnums=(0,)
+# only state and batch
+p_train_step = jax.jit(
+    train_step,
+    # state for first, batch for second
+    in_shardings=(P("data"), P("data")),
+    out_shardings=(P("data"), P("data")),
+    donate_argnames=("state"),
 )
-# p_eval_step = jax.pmap(partial(eval_step, label_smoothing_factor=label_smoothing_factor), "batch")
-# p_generate_step = jax.pmap(generate_step, "batch")

-# Replicate the train state on each device
-state = state.replicate()



@ -349,6 +348,21 @@ print(f"  Total optimization steps = {total_train_steps}")
 # %%
 # jax.profiler.start_trace("./traces")

+# Example batch (sharded across devices)
+sharded_batch = {
+    'input_ids': jax.device_put_sharded(batch['input_ids'], devices),
+    'attention_mask': jax.device_put_sharded(batch['attention_mask'], devices),
+    'labels': jax.device_put_sharded(batch['labels'], devices),
+    'decoder_input_ids': jax.device_put_sharded(batch['decoder_input_ids'], devices),
+    'decoder_attention_mask': jax.device_put_sharded(batch['decoder_attention_mask'], devices),
+}
+
+# Initial TrainState (pjit-ted TrainState)
+sharded_state = jax.device_put_replicated(train_state, devices)
+
+# %%
+
+
 rng, input_rng = jax.random.split(rng)
 train_time = 0
 epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
@ -363,7 +377,7 @@ for epoch in epochs:
    # Generate an epoch by shuffling sampling indices from the train dataset
    for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
        batch = next(train_loader)
-        batch = shard(batch)
+        # batch = shard(batch)
        state, train_metric = p_train_step(state, batch)
        train_metrics.append(train_metric)

--- a/parallel/t5_jax_train_fail.py
+++ b/parallel/t5_jax_train_fail.py
@ -0,0 +1,541 @@
+# %% [markdown]
+# # T5 implementation using jax with pjit
+
+
+# MARK: START
+# %%
+# let's make 8-device simulator
+import os
+
+# Set this to True to run the model on CPU only.
+USE_CPU_ONLY = True
+
+flags = os.environ.get("XLA_FLAGS", "")
+if USE_CPU_ONLY:
+    flags += " --xla_force_host_platform_device_count=8"  # Simulate 8 devices
+    # Enforce CPU-only execution
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    os.environ["JAX_PLATFORMS"] = "cpu"
+else:
+    # GPU flags
+    flags += (
+        "--xla_gpu_enable_triton_softmax_fusion=true "
+        "--xla_gpu_triton_gemm_any=false "
+        "--xla_gpu_enable_async_collectives=true "
+        "--xla_gpu_enable_latency_hiding_scheduler=true "
+        "--xla_gpu_enable_highest_priority_async_stream=true "
+    )
+os.environ["XLA_FLAGS"] = flags
+
+import functools
+from functools import partial
+from pprint import pprint
+from typing import Any, Dict, Tuple, Callable, Sequence
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from jax.experimental.shard_map import shard_map
+from jax.sharding import Mesh
+from jax.experimental.pjit import pjit
+from jax.sharding import PartitionSpec as P
+from ml_collections import ConfigDict
+import optax
+import logging
+import time
+from datasets import Dataset, load_from_disk
+
+from flax import jax_utils, traverse_util
+from flax.jax_utils import pad_shard_unpad, unreplicate
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+import flax.core
+
+from tqdm import tqdm
+
+from dataload import DataPrepare
+
+PyTree = Any
+Metrics = Dict[str, Tuple[jax.Array, ...]]
+
+if USE_CPU_ONLY:
+    jax.config.update('jax_platform_name', 'cpu')
+else:
+    jax.config.update("jax_default_matmul_precision", "bfloat16")
+
+
+# # %%
+# import jax
+# import jax.numpy as jnp
+# import optax
+# import numpy as np
+# from functools import partial
+# from typing import Callable, Optional
+# import math
+# 
+# # jax.config.update("jax_default_matmul_precision", "tensorfloat32")
+# jax.config.update("jax_default_matmul_precision", "bfloat16")
+# # jax.config.update("jax_enable_x64", False)
+# # enable cache
+# jax.config.update("jax_compilation_cache_dir", "/tmp/jax_cache")
+# jax.config.update("jax_persistent_cache_min_entry_size_bytes", -1)
+# jax.config.update("jax_persistent_cache_min_compile_time_secs", 0)
+# 
+# 
+# # from transformers import FlaxAutoModelForSeq2SeqLM, AutoConfig
+# 
+# from flax import jax_utils, traverse_util
+# from flax.jax_utils import pad_shard_unpad, unreplicate
+# from flax.training import train_state
+# from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+# import flax.core
+
+
+# %%
+# get platform type
+from jax.lib import xla_bridge
+print(xla_bridge.get_backend().platform)
+
+# %%
+# config options
+file_path = '/home/richard/Projects/learn_t5/simple_model/combined_data_t5_retrieval'
+save_path = 't5_80_1_bf16'
+# file_path = 'combined_data'
+split_datasets = load_from_disk(file_path)
+training_size = len(split_datasets['train'])
+# Store some constant
+seed = 117
+num_epochs = 5
+batch_size = 384  # 384 is the best
+num_train_epochs = num_epochs
+per_device_train_batch_size = batch_size
+train_batch_size = per_device_train_batch_size * jax.device_count()
+per_device_eval_batch_size = batch_size
+eval_batch_size = per_device_eval_batch_size * jax.device_count()
+steps_per_epoch = training_size // train_batch_size
+total_train_steps = steps_per_epoch * num_epochs
+
+warmup_steps = 0
+learning_rate = 2e-5
+
+weight_decay = 0.01
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_epsilon = 1e-8
+label_smoothing_factor = 0.0
+
+num_beams = 1
+val_max_target_length = 128
+
+predict_with_generate = True
+
+
+# %%
+# prepare data
+# init object
+# e.g. Config
+data_config = ConfigDict(
+    dict(
+        max_length=86,
+        pad_token_id=0,
+        decoder_start_token_id=0
+    )
+)
+
+dataprep = DataPrepare(split_datasets['train'], data_config)
+# # example usage
+# %%
+seed = 117
+rng = jax.random.PRNGKey(seed)
+train_loader = dataprep.data_loader(rng, batch_size=1)
+batch = next(iter(train_loader))
+
+# %%
+batch
+
+# %%
+# model
+
+from transformers import FlaxT5ForConditionalGeneration
+from transformers import T5Config
+
+config = T5Config()
+
+# If you want don't want to cast certain parameters (for example layer norm bias and scale)
+# then pass the mask as follows
+from flax import traverse_util
+model = FlaxT5ForConditionalGeneration.from_pretrained("t5-base", _do_init=False)
+
+# useful for transformer model
+model.enable_gradient_checkpointing()
+
+# enable bf16 except for layer_norm
+flat_params = traverse_util.flatten_dict(model.params)
+mask = {
+    path: not (path[-2] == "layer_norm" and path[-1] == "weight") for path in flat_params
+}
+mask = traverse_util.unflatten_dict(mask)
+model.params = model.to_bf16(model.params, mask)
+
+# %%
+
+model, params = FlaxT5ForConditionalGeneration.from_pretrained("t5-base", _do_init=False)
+t5_module = model.module
+
+# %%
+jax.tree.map(jnp.shape, model.params)
+
+# %%
+from jax.sharding import Mesh, NamedSharding
+from jax.sharding import PartitionSpec
+from pjit_partition import set_partitions
+
+params = model.params
+data_partition_specs = PartitionSpec()
+extra_param_keys = list(model._missing_keys)
+initial_partition_specs = set_partitions(params)
+# this is the partition spec we will use
+filled_param_partition_specs = set_partitions(params, extra_keys=extra_param_keys)
+
+# %%
+# let us see the param_partition_spec
+filled_param_partition_specs
+
+# %% let us set up the mesh
+
+from jax.sharding import Mesh
+devices = np.asarray(jax.devices())
+
+# %%
+
+# mp: model/tensor parallelism
+# dp: data parallelism
+# we just use 'data' as a common axis for data and model params
+mesh_axis_names = ("data")
+print("Logical mesh:", devices)
+
+mesh = Mesh(devices, mesh_axis_names)
+
+# it is technically possible to use pjit_partition to set special partition rules
+# e.g. by param size
+# but for now just move on
+
+# %% [markdown]
+# # Model
+#
+#
+#
+
+# %%
+
+# Initialize our training
+rng = jax.random.PRNGKey(seed)
+rng, dropout_rng = jax.random.split(rng)
+
+
+# %%
+# optimization functions
+
+def create_learning_rate_fn(
+    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.ndarray]:
+    """Returns a linear warmup, linear_decay learning rate function."""
+    steps_per_epoch = train_ds_size // train_batch_size
+    num_train_steps = steps_per_epoch * num_train_epochs
+    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay_fn = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+    )
+    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+    return schedule_fn
+
+
+# Create learning rate schedule
+linear_decay_lr_schedule_fn = create_learning_rate_fn(
+    training_size,
+    train_batch_size,
+    num_train_epochs,
+    warmup_steps,
+    learning_rate,
+)
+
+# We use Optax's "masking" functionality to not apply weight decay
+# to bias and LayerNorm scale parameters. decay_mask_fn returns a
+# mask boolean with the same structure as the parameters.
+# The mask is True for parameters that should be decayed.
+def decay_mask_fn(params):
+    flat_params = traverse_util.flatten_dict(params)
+    # find out all LayerNorm parameters
+    layer_norm_candidates = ["layernorm", "layer_norm", "ln"]
+    layer_norm_named_params = {
+        layer[-2:]
+        for layer_norm_name in layer_norm_candidates
+        for layer in flat_params.keys()
+        if layer_norm_name in "".join(layer).lower()
+    }
+    flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
+    return traverse_util.unflatten_dict(flat_mask)
+
+# create adam optimizer
+adamw = optax.adamw(
+    learning_rate=linear_decay_lr_schedule_fn,
+    b1=adam_beta1,
+    b2=adam_beta2,
+    eps=adam_epsilon,
+    weight_decay=weight_decay,
+    mask=decay_mask_fn,
+)
+
+
+# %%
+# Training functions
+# class TrainState(train_state.TrainState):
+#     dropout_rng: jnp.ndarray
+# 
+#     # easy way to achieve data parallelism
+#     # also achieves folding of rng keys
+#     def replicate(self):
+#         return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
+
+# set bf16 for model params
+# model.params = model.to_bf16(model.params)
+# Cast parameters to bfloat16 if desired
+# params = jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16), params)
+
+# state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+
+# %%
+# see if we can init the model
+
+# %%
+from transformers import FlaxT5Model, T5Config
+config = T5Config.from_pretrained('t5-base')
+model = FlaxT5Model(config, _do_init=True).module
+
+# %%
+# Initialize random key and input for initialization
+rng = jax.random.PRNGKey(0)
+train_loader = dataprep.data_loader(rng, batch_size=1)
+batch = next(iter(train_loader))
+
+# %%
+
+# Initialize model parameters
+# init of FlaxT5Module.__call__
+variables = model.init(rng,
+                       input_ids=batch['input_ids'],
+                       attention_mask=batch['attention_mask'],
+                       decoder_input_ids=batch['decoder_attention_mask'],
+                       decoder_attention_mask=batch['decoder_attention_mask']
+                       )
+params = variables['params']
+
+
+# %%
+# create an init_fn
+def init_fn(rng: jax.random.PRNGKey, batch, model) -> train_state.TrainState:
+    init_rng, rng = jax.random.split(rng)
+    variables = model.init(
+        init_rng,
+        input_ids=batch['input_ids'],
+        attention_mask=batch['attention_mask'],
+        decoder_input_ids=batch['decoder_attention_mask'],
+        decoder_attention_mask=batch['decoder_attention_mask']
+    )
+    params = variables.pop("params")
+    state = train_state.TrainState.create(
+        apply_fn=model.__call__,
+        params=params,
+        tx=adamw,
+    )
+    return state
+
+
+
+
+# %%
+# we do not know the output PartitionSpec
+# we perform the hack where we just initialize it just to find the outspec
+init_fn_try = shard_map(
+    functools.partial(init_fn, model=model),
+    mesh,
+    # 2nd argument is for the model
+    in_specs=(P(), P("data")),
+    out_specs=P(),
+    check_rep=False
+)
+
+# %%
+rng, model_init_rng = jax.random.split(rng)
+train_loader = dataprep.data_loader(model_init_rng, batch_size=batch_size)
+batch = next(iter(train_loader))
+
+
+state_fsdp_shapes = jax.eval_shape(init_fn_try, model_init_rng, batch)
+state_fsdp_specs = nn.get_partition_spec(state_fsdp_shapes)
+
+# print("RNG", state_fsdp_specs.rng)
+print("\nParameters")
+pprint(state_fsdp_specs.params)
+print("\nOptimizer state")
+pprint(state_fsdp_specs.opt_state[0])
+
+# note: state_fsdp_specs is now ready to be used as pjit outspec
+
+
+
+# %%
+# Setup train state
+# state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+state = jax.jit(
+    init_fn,
+    in_shardings=(P(), P("data")),
+    out_shardings=state_fsdp_specs,
+)
+
+
+
+# %%
+
+# label smoothed cross entropy
+def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0):
+    """
+    The label smoothing implementation is adapted from Flax's official example:
+    https://github.com/google/flax/blob/87a211135c6a377c8f29048a1cac3840e38b9da4/examples/wmt/train.py#L104
+    """
+    vocab_size = logits.shape[-1]
+    confidence = 1.0 - label_smoothing_factor
+    low_confidence = (1.0 - confidence) / (vocab_size - 1)
+    normalizing_constant = -(
+        confidence * jnp.log(confidence) + (vocab_size - 1) * low_confidence * jnp.log(low_confidence + 1e-20)
+    )
+    soft_labels = onehot(labels, vocab_size, on_value=confidence, off_value=low_confidence)
+
+    loss = optax.softmax_cross_entropy(logits, soft_labels)
+    loss = loss - normalizing_constant
+
+    # ignore padded tokens from loss
+    loss = loss * padding_mask
+    loss = loss.sum()
+    num_labels = padding_mask.sum()
+    return loss, num_labels
+
+# MARK: train_step
+# Define gradient update step fn
+def train_step(state, batch):
+    label_smoothing_factor=0.0
+    dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
+
+    def compute_loss(params):
+        labels = batch.pop("labels")
+        logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+        loss, num_labels = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
+        return loss, num_labels
+
+    # compute gradients through computational graph
+    grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
+    (loss, num_labels), grad = grad_fn(state.params)
+    num_labels = jax.lax.psum(num_labels, "batch")
+
+    # true loss = total loss / total samples
+    # loss = jax.lax.psum(loss, "batch")
+    # loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
+
+    # true grad = total grad / total samples
+    grad = jax.lax.psum(grad, "batch")
+    grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
+    new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
+
+    metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
+    return new_state, metrics
+
+# max_length = (
+#     val_max_target_length if val_max_target_length is not None else model.config.max_length
+# )
+# num_beams = num_beams if num_beams is not None else model.config.num_beams
+# gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+# Create parallel version of the train and eval step
+# only state and batch
+p_train_step = jax.jit(
+    train_step,
+    # state for first, batch for second
+    in_shardings=(P("data"), P("data")),
+    out_shardings=(P("data"), P("data")),
+    donate_argnames=("state"),
+)
+
+
+
+
+# %%
+
+
+print("***** Running training *****")
+print(f"  Num examples = {training_size}")
+print(f"  Num Epochs = {num_epochs}")
+print(f"  Instantaneous batch size per device = {per_device_train_batch_size}")
+print(f"  Total train batch size (w. parallel & distributed) = {train_batch_size}")
+print(f"  Total optimization steps = {total_train_steps}")
+
+
+# %%
+# jax.profiler.start_trace("./traces")
+
+# Example batch (sharded across devices)
+sharded_batch = {
+    'input_ids': jax.device_put_sharded(batch['input_ids'], devices),
+    'attention_mask': jax.device_put_sharded(batch['attention_mask'], devices),
+    'labels': jax.device_put_sharded(batch['labels'], devices),
+    'decoder_input_ids': jax.device_put_sharded(batch['decoder_input_ids'], devices),
+    'decoder_attention_mask': jax.device_put_sharded(batch['decoder_attention_mask'], devices),
+}
+
+# Initial TrainState (pjit-ted TrainState)
+sharded_state = jax.device_put_replicated(train_state, devices)
+
+# %%
+
+
+rng, input_rng = jax.random.split(rng)
+train_time = 0
+epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+for epoch in epochs:
+    train_start = time.time()
+
+    # Create sampling rng
+    train_metrics = []
+    rng, data_rng = jax.random.split(rng)
+    train_loader = dataprep.data_loader(data_rng, batch_size=batch_size)
+    steps_per_epoch = training_size // train_batch_size
+    # Generate an epoch by shuffling sampling indices from the train dataset
+    for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
+        batch = next(train_loader)
+        # batch = shard(batch)
+        state, train_metric = p_train_step(state, batch)
+        train_metrics.append(train_metric)
+
+    train_time = time.time() - train_start
+
+    train_metric = unreplicate(train_metric)
+    train_metric['loss'].block_until_ready()
+
+
+
+    epochs.write(
+        # f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, "
+        f"Epoch... ({epoch + 1}/{num_epochs} |  "
+        # f"Learning Rate:{train_metric['learning_rate']}, "
+        f"Last train time: {train_time})"
+    )
+# jax.profiler.stop_trace()
+# %%
+
+# output_dir = save_path
+# # save checkpoint after each epoch and push checkpoint to the hub
+# if jax.process_index() == 0:
+#     params = jax.device_get(jax.tree_util.tree_map(lambda x: x[0], state.params))
+#     params = jax.tree_util.tree_map(lambda x: x.astype(jnp.float32), params)
+#     model.save_pretrained(output_dir, params=params)
+#     tokenizer.save_pretrained(output_dir)