# single timestamp at the end means no speech after the last timestamp. last_timestamp_position = timestamps[-1].item() - tokenizer.timestamp_begin duration = last_timestamp_position * time_precision add_segment( start=timestamp_offset, end=timestamp_offset + duration, text_tokens=tokens...