Step-2 API In-Depth Guide: Trillion-Parameter MoE Model in Action
Complete Step-2 API tutorial from StepFun: comprehensive coverage of text, multimodal, and function calling. Includes complete calling code for Step-2-16K text model, Step-1V vision model, and Step-Audio speech model.
What This Tutorial Solves
You will learn to use StepFun’s trillion-parameter MoE large model:
- Step-2 text API calling
- Step-1V multimodal vision
- Step-Audio speech model
- Function Calling
🎯 StepFun is one of China’s most mysterious AI companies. Step-2 is a trillion-parameter MoE model that performs exceptionally well across multiple Chinese benchmarks.
Step-2 Text API
from openai import OpenAI
import os
class Step2Client:
"""StepFun Step-2 API"""
def __init__(self):
self.client = OpenAI(
api_key=os.getenv("STEP_API_KEY"),
base_url="https://api.stepfun.com/v1",
)
def chat(
self,
prompt: str,
model: str = "step-2-16k",
system_prompt: str = None,
temperature: float = 0.7,
) -> str:
"""Basic conversation"""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = self.client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
max_tokens=2048,
)
return response.choices[0].message.content
def chat_stream(self, prompt: str, system_prompt: str = None):
"""Streaming conversation"""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
stream = self.client.chat.completions.create(
model="step-2-16k",
messages=messages,
temperature=0.7,
max_tokens=2048,
stream=True,
)
for chunk in stream:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="", flush=True)
print()
def analyze_long_text(self, text: str, task: str) -> str:
"""Long-text analysis (16K context)"""
response = self.client.chat.completions.create(
model="step-2-16k",
messages=[
{
"role": "system",
"content": f"You are a text analysis expert. Task: {task}",
},
{"role": "user", "content": text[:15000]},
],
temperature=0.3,
max_tokens=4096,
)
return response.choices[0].message.content
# Usage
step = Step2Client()
# Basic conversation
reply = step.chat(
"Introduce StepFun's development history and core technologies",
system_prompt="You are an expert knowledgeable about the AI industry",
)
print(reply)
# Long-text analysis
long_text = """[Insert a long article here]"""
analysis = step.analyze_long_text(long_text, "Summarize key viewpoints and extract critical data")
print(analysis)
Step-1V Multimodal Vision
class StepVision:
"""Step-1V vision model"""
def __init__(self):
self.client = OpenAI(
api_key=os.getenv("STEP_API_KEY"),
base_url="https://api.stepfun.com/v1",
)
def analyze_image(self, image_url: str, question: str = None) -> str:
"""Analyze an image"""
response = self.client.chat.completions.create(
model="step-1v-8k",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": question or "Please describe the content of this image in detail",
},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
},
],
temperature=0.3,
max_tokens=1024,
)
return response.choices[0].message.content
def compare_images(self, image_urls: list[str], question: str) -> str:
"""Compare two images"""
content = [{"type": "text", "text": question}]
for url in image_urls:
content.append({
"type": "image_url",
"image_url": {"url": url},
})
response = self.client.chat.completions.create(
model="step-1v-8k",
messages=[{"role": "user", "content": content}],
temperature=0.3,
max_tokens=1024,
)
return response.choices[0].message.content
def ocr_image(self, image_url: str) -> str:
"""Image text recognition"""
return self.analyze_image(
image_url,
question="Extract all text from this image, preserving the original format and order",
)
# Usage
vision = StepVision()
# Image analysis
desc = vision.analyze_image(
"https://example.com/chart.png",
question="What data trends does this chart show?",
)
print(f"Chart analysis: {desc}")
Function Calling
import json
class StepFunctionCalling:
"""Step-2 Function Calling"""
def __init__(self):
self.client = OpenAI(
api_key=os.getenv("STEP_API_KEY"),
base_url="https://api.stepfun.com/v1",
)
# Define available functions
self.functions = [
{
"type": "function",
"function": {
"name": "search_database",
"description": "Search the product database",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string", "description": "Search keyword"},
"category": {"type": "string", "enum": ["Electronics", "Clothing", "Food", "Books"]},
"max_price": {"type": "number"},
"sort_by": {"type": "string", "enum": ["price", "sales", "rating"]},
},
"required": ["query"],
},
},
},
{
"type": "function",
"function": {
"name": "create_order",
"description": "Create an order",
"parameters": {
"type": "object",
"properties": {
"product_id": {"type": "string"},
"quantity": {"type": "integer", "minimum": 1},
"address": {"type": "string"},
},
"required": ["product_id", "quantity", "address"],
},
},
},
]
def agent_call(self, user_message: str, conversation_history: list = None) -> dict:
"""Agent conversation with function calling"""
messages = conversation_history or []
messages.append({"role": "user", "content": user_message})
response = self.client.chat.completions.create(
model="step-2-16k",
messages=messages,
tools=self.functions,
tool_choice="auto",
temperature=0.3,
)
msg = response.choices[0].message
# Check for function calls
if msg.tool_calls:
tool_call = msg.tool_calls[0]
function_name = tool_call.function.name
arguments = json.loads(tool_call.function.arguments)
# Execute function
result = self._execute_function(function_name, arguments)
# Return function result to model
messages.append(msg)
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": result,
})
second_response = self.client.chat.completions.create(
model="step-2-16k",
messages=messages,
)
return {
"reply": second_response.choices[0].message.content,
"function_called": function_name,
"function_result": result,
}
return {"reply": msg.content, "function_called": None}
def _execute_function(self, name: str, args: dict) -> str:
"""Simulate function call execution"""
if name == "search_database":
return f"Search results: found 5 items in {args.get('category', '')}, priced below {args.get('max_price', 'no limit')} yuan"
elif name == "create_order":
return f"Order created: product {args['product_id']} x{args['quantity']}, shipping to {args['address']}"
return "Unknown operation"
# Usage
agent = StepFunctionCalling()
result = agent.agent_call("Find electronics under 500 yuan, sorted by rating")
print(f"Reply: {result['reply']}")
if result["function_called"]:
print(f"Called: {result['function_called']}")
print(f"Result: {result['function_result']}")
Step-2 vs Other Models
| Dimension | Step-2-16K | DeepSeek-V3 | Qwen-Plus | GLM-4 |
|---|---|---|---|---|
| Parameter Scale | ~1T MoE | 671B MoE | Unknown | Unknown |
| Context | 16K | 64K | 32K | 128K |
| Chinese Ability | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| Function Calling | ✅ | ✅ (Beta) | ✅ | ✅ |
| Multimodal | Step-1V | ❌ | ✅ | ✅ |
| Pricing | Medium | Very low | Medium | Medium |
Model Selection Cheat Sheet
| Scenario | Recommended Model |
|---|---|
| Conversation / Writing / Translation | step-2-16k |
| Image Understanding / OCR | step-1v-8k |
| Voice Interaction | step-audio |
| Function Calling / Agent | step-2-16k |
| Long Document Processing | step-2-16k |
Next Steps
📝 Based on StepFun Step series models, June 2026.
Advertisement