Content Moderation
Implement content safety for user inputs and AI outputs
Protect your users and platform by implementing content moderation for both user inputs and AI-generated outputs.
Why Moderate?
Protect Users
Shield users from harmful, offensive, or inappropriate content
Platform Safety
Maintain community standards and brand reputation
Legal Compliance
Meet regulatory requirements (GDPR, DSA, etc.)
Reduce Abuse
Prevent misuse of your AI-powered features
Moderation Endpoint
Use the /v1/moderate endpoint:
curl https://api.assisters.dev/v1/moderate \
-H "Authorization: Bearer ask_your_api_key" \
-H "Content-Type: application/json" \
-d '{
"model": "assisters-moderation-v1",
"input": "Content to check"
}'
Response:
{
"results": [{
"flagged": false,
"categories": {
"hate": false,
"harassment": false,
"self-harm": false,
"sexual": false,
"violence": false
},
"category_scores": {
"hate": 0.0001,
"harassment": 0.0002,
"self-harm": 0.0001,
"sexual": 0.0001,
"violence": 0.0001
}
}]
}
Moderation Categories
| Category | Description |
|---|---|
| hate | Content expressing hatred toward protected groups |
| hate/threatening | Hateful content that includes threats of violence |
| harassment | Content meant to harass or bully |
| harassment/threatening | Harassment with explicit threats |
| self-harm | Content promoting self-harm |
| self-harm/intent | Expression of self-harm intent |
| self-harm/instructions | Instructions for self-harm |
| sexual | Sexually explicit content |
| sexual/minors | Sexual content involving minors |
| violence | Content depicting violence |
| violence/graphic | Graphic depictions of violence |
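The example response above only shows the top-level categories; the slash-separated sub-categories in this table are assumed to appear under the same names in category_scores. A minimal sketch that calls the raw /v1/moderate endpoint with requests and surfaces any score above a floor (key names are taken from this table, not verified against every response):
import requests
API_KEY = "ask_your_api_key"
def category_report(text, min_score=0.01):
    # Call the /v1/moderate endpoint directly, as in the curl example above.
    response = requests.post(
        "https://api.assisters.dev/v1/moderate",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"model": "assisters-moderation-v1", "input": text},
        timeout=30,
    )
    response.raise_for_status()
    result = response.json()["results"][0]
    # Keep any category or sub-category (e.g. "hate/threatening") whose
    # score clears the floor, so you can see why something was flagged.
    notable = {
        category: score
        for category, score in result["category_scores"].items()
        if score >= min_score
    }
    return result["flagged"], notable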
Implementation Patterns
1. Moderate User Inputs (Pre-moderation)
Check all user messages before processing:
from openai import OpenAI
client = OpenAI(api_key="ask_...", base_url="https://api.assisters.dev/v1")
def moderate_input(text):
result = client.moderations.create(
model="assisters-moderation-v1",
input=text
).results[0]
return not result.flagged, result
def process_message(user_message):
is_safe, moderation = moderate_input(user_message)
if not is_safe:
# Log for review
log_violation(user_message, moderation)
return "Your message violates our content policy."
# Process normally
    return generate_response(user_message)
2. Moderate AI Outputs (Post-moderation)
Verify AI responses before showing to users:
def safe_generate(user_message):
# Generate response
response = client.chat.completions.create(
model="assisters-chat-v1",
messages=[{"role": "user", "content": user_message}]
)
content = response.choices[0].message.content
# Moderate output
is_safe, moderation = moderate_input(content)
if not is_safe:
log_unsafe_output(content, moderation)
return "I apologize, but I cannot provide that response."
    return content
3. Bi-directional Moderation
Check both inputs AND outputs:
def fully_moderated_chat(user_message):
# 1. Check input
input_safe, input_mod = moderate_input(user_message)
if not input_safe:
return "Your message violates our content policy."
# 2. Generate response
response = generate_response(user_message)
# 3. Check output
output_safe, output_mod = moderate_input(response)
if not output_safe:
# Try regenerating or return safe fallback
return "I cannot provide a response to that."
    return response
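If you want to actually attempt the regeneration mentioned in the comment above before falling back, one small variation (moderate_input and generate_response are the same placeholder helpers used in the earlier patterns):
def fully_moderated_chat_with_retry(user_message, max_attempts=2):
    # 1. Check input once
    input_safe, _ = moderate_input(user_message)
    if not input_safe:
        return "Your message violates our content policy."
    # 2. Regenerate up to max_attempts times if the output is flagged
    for _ in range(max_attempts):
        response = generate_response(user_message)
        output_safe, _ = moderate_input(response)
        if output_safe:
            return response
    # 3. Safe fallback after exhausting attempts
    return "I cannot provide a response to that."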
4. Custom Thresholds
Use category_scores for fine-grained control:
def custom_moderation(text, thresholds=None):
if thresholds is None:
thresholds = {
"hate": 0.5,
"harassment": 0.5,
"violence": 0.3, # Stricter on violence
"sexual": 0.7, # More lenient
}
result = client.moderations.create(
model="assisters-moderation-v1",
input=text
).results[0]
violations = []
for category, threshold in thresholds.items():
score = result.category_scores.get(category, 0)
if score > threshold:
violations.append({
"category": category,
"score": score,
"threshold": threshold
})
    return len(violations) == 0, violations
5. Batch Moderation
Moderate multiple items efficiently:
def moderate_batch(texts):
result = client.moderations.create(
model="assisters-moderation-v1",
input=texts # Up to 100 items
)
return [
{"text": text, "flagged": r.flagged, "categories": r.categories}
for text, r in zip(texts, result.results)
]
# Example: Moderate comments
comments = ["comment 1", "comment 2", "comment 3"]
results = moderate_batch(comments)
for r in results:
if r["flagged"]:
        print(f"Flagged: {r['text'][:50]}...")
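For lists longer than the 100-item limit noted above, one simple approach is to chunk the input and reuse moderate_batch per chunk:
def moderate_many(texts, chunk_size=100):
    # Split the list into chunks of up to chunk_size items and moderate each chunk.
    results = []
    for i in range(0, len(texts), chunk_size):
        results.extend(moderate_batch(texts[i:i + chunk_size]))
    return results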
Handling Violations
1. Block and Notify
def handle_violation(user_id, content, moderation):
# Block the content
blocked = True
# Notify user
notification = f"Your message was blocked: {get_violation_reason(moderation)}"
# Log for review
log_to_moderation_queue(user_id, content, moderation)
    return blocked, notification
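get_violation_reason above is left undefined; one possible sketch that names the flagged categories (it assumes moderation.categories behaves like a dict of booleans, matching the response shape shown earlier):
def get_violation_reason(moderation):
    # Build a short, human-readable reason from the boolean category flags.
    flagged = [name for name, hit in moderation.categories.items() if hit]
    if not flagged:
        return "content policy violation"
    return "flagged for " + ", ".join(flagged)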
2. Review Queue
from enum import Enum
class ModerationAction(Enum):
APPROVE = "approve"
REJECT = "reject"
ESCALATE = "escalate"
def add_to_review_queue(content, moderation, priority="normal"):
# High scores = higher priority
max_score = max(moderation.category_scores.values())
if max_score > 0.8:
priority = "high"
review_item = {
"content": content,
"categories": moderation.categories,
"scores": moderation.category_scores,
"priority": priority,
"status": "pending"
}
    save_to_queue(review_item)
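The ModerationAction enum above is only useful once reviewers can act on queued items. A minimal sketch of applying a reviewer's decision (publish_content, discard_content, and notify_trust_and_safety are placeholders, like save_to_queue):
def resolve_review_item(review_item, action: ModerationAction):
    # Record the reviewer's decision and route the content accordingly.
    review_item["status"] = action.value
    if action is ModerationAction.APPROVE:
        publish_content(review_item["content"])
    elif action is ModerationAction.REJECT:
        discard_content(review_item["content"])
    else:  # ModerationAction.ESCALATE
        notify_trust_and_safety(review_item)
    save_to_queue(review_item)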
3. User Warnings
def handle_user_violation(user_id, violation_type):
# Get violation history
history = get_user_violations(user_id)
if len(history) == 0:
# First violation: warn
return send_warning(user_id)
elif len(history) < 3:
# Repeat violations: temporary restriction
return restrict_user(user_id, hours=24)
else:
# Multiple violations: escalate
        return escalate_to_admin(user_id)
Best Practices
Moderate Both Directions
Check user inputs AND AI outputs
Use Custom Thresholds
Tune sensitivity based on your use case
Log Everything
Keep audit trails for compliance
Human Review
Have humans review borderline cases
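For the "Log Everything" practice, a minimal audit record could be one JSON line per decision (the field names here are illustrative, not a required schema):
import json
import time
def log_moderation_decision(user_id, content, flagged, scores, decision):
    # Append one JSON line per moderation decision for later audit.
    record = {
        "timestamp": time.time(),
        "user_id": user_id,
        "content": content,
        "flagged": flagged,
        "scores": scores,      # the category_scores dict from the API
        "decision": decision,  # e.g. "blocked", "allowed", "queued"
    }
    with open("moderation_audit.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")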
Complete Example
from openai import OpenAI
from dataclasses import dataclass
from typing import Optional
@dataclass
class ModerationResult:
is_safe: bool
reason: Optional[str]
scores: dict
class ContentModerator:
def __init__(self, api_key: str, thresholds: dict = None):
self.client = OpenAI(
api_key=api_key,
base_url="https://api.assisters.dev/v1"
)
self.thresholds = thresholds or {
"hate": 0.5,
"harassment": 0.5,
"self-harm": 0.3,
"sexual": 0.5,
"violence": 0.5,
}
def check(self, text: str) -> ModerationResult:
result = self.client.moderations.create(
model="assisters-moderation-v1",
input=text
).results[0]
# Check against custom thresholds
for category, threshold in self.thresholds.items():
score = getattr(result.category_scores, category, 0)
if score > threshold:
return ModerationResult(
is_safe=False,
reason=f"Content flagged for {category} (score: {score:.2f})",
scores=result.category_scores.__dict__
)
return ModerationResult(
is_safe=True,
reason=None,
scores=result.category_scores.__dict__
)
def safe_chat(self, user_message: str) -> str:
# Check input
input_check = self.check(user_message)
if not input_check.is_safe:
return f"Message blocked: {input_check.reason}"
# Generate response
response = self.client.chat.completions.create(
model="assisters-chat-v1",
messages=[{"role": "user", "content": user_message}]
)
content = response.choices[0].message.content
# Check output
output_check = self.check(content)
if not output_check.is_safe:
return "I cannot provide an appropriate response."
return content
# Usage
moderator = ContentModerator(api_key="ask_...")
response = moderator.safe_chat("Hello, how are you?")
print(response)
Pricing
| Model | Price per Million Tokens |
|---|---|
| assisters-moderation-v1 | $0.05 |
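At that rate, moderating roughly 5 million tokens of content, for example about 200,000 short comments of around 25 tokens each, would cost approximately $0.25.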
View Moderation Models
Compare moderation model capabilities