mistral-rate-limits


Mistral AI Rate Limits


Overview


Handle Mistral AI rate limits gracefully with exponential backoff and request management.

Prerequisites


  • Mistral AI SDK installed
  • Understanding of async/await patterns
  • Access to rate limit headers

Instructions


Step 1: Understand Rate Limit Tiers


| Tier | Requests/min | Tokens/min | Tokens/month |
|------|--------------|------------|--------------|
| Free | 2 | 500K | 1B |
| Production | 120 | 1M | 10B |
| Enterprise | Custom | Custom | Custom |
Note: Limits vary by model and are subject to change. Check console.mistral.ai for current limits.
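The tier table can be captured as a small lookup for configuring the limiters in the later steps. This is a minimal sketch: the numbers mirror the table above and should be re-checked against console.mistral.ai, and names such as `TIER_LIMITS` and `limitsFor` are illustrative, not part of the SDK:

```typescript
interface TierLimits {
  requestsPerMinute: number;
  tokensPerMinute: number;
  tokensPerMonth: number;
}

// Values copied from the tier table above; Enterprise limits are
// negotiated per contract, so that tier is omitted here.
const TIER_LIMITS: Record<string, TierLimits> = {
  free:       { requestsPerMinute: 2,   tokensPerMinute: 500_000,   tokensPerMonth: 1_000_000_000 },
  production: { requestsPerMinute: 120, tokensPerMinute: 1_000_000, tokensPerMonth: 10_000_000_000 },
};

function limitsFor(tier: string): TierLimits | undefined {
  return TIER_LIMITS[tier];
}
```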

Step 2: Implement Exponential Backoff with Jitter


```typescript
interface RetryConfig {
  maxRetries: number;
  baseDelayMs: number;
  maxDelayMs: number;
  jitterMs: number;
}

async function withExponentialBackoff<T>(
  operation: () => Promise<T>,
  config: RetryConfig = {
    maxRetries: 5,
    baseDelayMs: 1000,
    maxDelayMs: 60000,
    jitterMs: 500
  }
): Promise<T> {
  for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
    try {
      return await operation();
    } catch (error: any) {
      if (attempt === config.maxRetries) throw error;

      const status = error.status;

      // Only retry on rate limits (429) or server errors (5xx)
      if (status !== 429 && (status < 500 || status >= 600)) throw error;

      // Honor the Retry-After header when the API provides one
      const retryAfter = error.headers?.['retry-after'];
      const retryAfterSec = Number.parseInt(retryAfter ?? '', 10);
      let delay: number;

      if (!Number.isNaN(retryAfterSec)) {
        delay = retryAfterSec * 1000;
      } else {
        // Exponential delay with jitter to prevent a thundering herd
        const exponentialDelay = config.baseDelayMs * Math.pow(2, attempt);
        const jitter = Math.random() * config.jitterMs;
        delay = Math.min(exponentialDelay + jitter, config.maxDelayMs);
      }

      console.log(`Attempt ${attempt + 1} failed (${status}). Retrying in ${delay.toFixed(0)}ms...`);
      await new Promise(r => setTimeout(r, delay));
    }
  }
  throw new Error('Unreachable');
}

// Usage
const response = await withExponentialBackoff(() =>
  client.chat.complete({
    model: 'mistral-small-latest',
    messages: [{ role: 'user', content: 'Hello!' }],
  })
);
```

Step 3: Token-Based Rate Limiting


```typescript
class TokenRateLimiter {
  private tokensUsed = 0;
  private windowStart = Date.now();
  private readonly tokensPerMinute: number;
  private readonly windowMs = 60000; // 1 minute

  constructor(tokensPerMinute = 500000) {
    this.tokensPerMinute = tokensPerMinute;
  }

  async waitForCapacity(estimatedTokens: number): Promise<void> {
    const now = Date.now();
    const elapsed = now - this.windowStart;

    // Reset window if needed
    if (elapsed >= this.windowMs) {
      this.tokensUsed = 0;
      this.windowStart = now;
    }

    // Check if we need to wait
    if (this.tokensUsed + estimatedTokens > this.tokensPerMinute) {
      const waitTime = this.windowMs - elapsed;
      console.log(`Token limit approaching. Waiting ${waitTime}ms...`);
      await new Promise(r => setTimeout(r, waitTime));
      this.tokensUsed = 0;
      this.windowStart = Date.now();
    }
  }

  recordUsage(tokensUsed: number): void {
    this.tokensUsed += tokensUsed;
  }
}

// Usage
const rateLimiter = new TokenRateLimiter(500000);

async function rateLimitedChat(messages: Message[]): Promise<string> {
  // Estimate tokens (rough: 4 chars per token)
  const estimatedTokens = JSON.stringify(messages).length / 4;

  await rateLimiter.waitForCapacity(estimatedTokens + 500); // +500 for response

  const response = await client.chat.complete({
    model: 'mistral-small-latest',
    messages,
  });

  if (response.usage) {
    rateLimiter.recordUsage(response.usage.totalTokens || 0);
  }

  return response.choices?.[0]?.message?.content ?? '';
}
```

Step 4: Request Queue with Concurrency Control


```typescript
import PQueue from 'p-queue';

const requestQueue = new PQueue({
  concurrency: 5,        // Max concurrent requests
  interval: 1000,        // 1 second interval
  intervalCap: 10,       // Max 10 requests per interval
});

async function queuedRequest<T>(operation: () => Promise<T>): Promise<T> {
  // p-queue types add() as Promise<T | void>, so assert the resolved type
  return requestQueue.add(() => withExponentialBackoff(operation)) as Promise<T>;
}

// Usage
const results = await Promise.all(
  prompts.map(prompt =>
    queuedRequest(() =>
      client.chat.complete({
        model: 'mistral-small-latest',
        messages: [{ role: 'user', content: prompt }],
      })
    )
  )
);
```

Step 5: Rate Limit Monitor


```typescript
class RateLimitMonitor {
  private requestCount = 0;
  private lastReset = Date.now();
  private readonly alertThreshold: number;

  constructor(alertThreshold = 0.8) {
    this.alertThreshold = alertThreshold;
  }

  recordRequest(): void {
    const now = Date.now();
    if (now - this.lastReset >= 60000) {
      this.requestCount = 0;
      this.lastReset = now;
    }
    this.requestCount++;
  }

  checkThreshold(maxRequests: number): void {
    if (this.requestCount / maxRequests > this.alertThreshold) {
      console.warn(`Rate limit warning: ${this.requestCount}/${maxRequests} requests used`);
    }
  }

  getStats(): { requestCount: number; windowRemaining: number } {
    return {
      requestCount: this.requestCount,
      windowRemaining: 60000 - (Date.now() - this.lastReset),
    };
  }
}
```

Output


  • Reliable API calls with automatic retry
  • Token-based rate limiting
  • Request queue with concurrency control
  • Rate limit monitoring

Error Handling


| Header | Description | Action |
|--------|-------------|--------|
| Retry-After | Seconds to wait | Honor this value |
| X-RateLimit-Limit | Max requests | Monitor usage |
| X-RateLimit-Remaining | Remaining requests | Throttle if low |
| X-RateLimit-Reset | Reset timestamp | Wait until reset |
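A small sketch of reading these headers into a typed structure, assuming the header names from the table (actual names may differ by API version; `parseRateLimitHeaders` is an illustrative helper, not an SDK function):

```typescript
interface RateLimitInfo {
  limit: number | null;
  remaining: number | null;
  resetAt: Date | null;
  retryAfterMs: number | null;
}

// Accepts any lowercase-keyed header map (fetch's Headers normalizes
// names to lowercase; a plain Map is used here for portability).
function parseRateLimitHeaders(headers: Map<string, string>): RateLimitInfo {
  const get = (name: string) => headers.get(name.toLowerCase()) ?? null;
  const num = (v: string | null) => (v === null ? null : Number(v));

  const reset = num(get('X-RateLimit-Reset'));       // Unix timestamp (seconds)
  const retryAfter = num(get('Retry-After'));        // seconds to wait

  return {
    limit: num(get('X-RateLimit-Limit')),
    remaining: num(get('X-RateLimit-Remaining')),
    resetAt: reset === null ? null : new Date(reset * 1000),
    retryAfterMs: retryAfter === null ? null : retryAfter * 1000,
  };
}
```

The result feeds naturally into the backoff and monitor code above: honor `retryAfterMs` when present, and throttle proactively when `remaining` gets low.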

Examples


Python Rate Limiting


```python
import asyncio
import random

from mistralai import Mistral

async def with_retry(client, max_retries=3):
    for attempt in range(max_retries):
        try:
            return await client.chat.complete_async(
                model="mistral-small-latest",
                messages=[{"role": "user", "content": "Hello"}]
            )
        except Exception as e:
            # SDK error objects may expose the HTTP code as status or status_code
            status = getattr(e, "status", None) or getattr(e, "status_code", None)
            if status == 429:
                delay = (2 ** attempt) + (random.random() * 0.5)
                print(f"Rate limited. Waiting {delay:.1f}s...")
                await asyncio.sleep(delay)
            else:
                raise
    raise Exception("Max retries exceeded")
```

Batch Processing with Rate Limiting


```typescript
async function processBatch<T, R>(
  items: T[],
  processor: (item: T) => Promise<R>,
  batchSize = 5,
  delayMs = 1000
): Promise<R[]> {
  const results: R[] = [];

  for (let i = 0; i < items.length; i += batchSize) {
    const batch = items.slice(i, i + batchSize);

    const batchResults = await Promise.all(
      batch.map(item => withExponentialBackoff(() => processor(item)))
    );

    results.push(...batchResults);

    // Delay between batches
    if (i + batchSize < items.length) {
      await new Promise(r => setTimeout(r, delayMs));
    }
  }

  return results;
}
```


Next Steps


For security configuration, see mistral-security-basics.