################################################################################
#
# FINANCIAL MORTGAGES PORTFOLIO ANALYTICS
#
# STRATEGY: Allow all major crawlers to maximize discoverability in:
#   - ChatGPT, SearchGPT, OAI-SearchBot
#   - Claude, Claude Web, Claude Search
#   - Perplexity AI
#   - Google Gemini, Google-Extended
#   - Apple Siri, Apple Intelligence
#   - Meta AI, Facebook
#   - DuckDuckGo AI
#   - Microsoft Copilot, Bing
#   - ByteDance/TikTok AI
#   - Common Crawl (publicly available LLM training data)
#
# VERIFIED SOURCES:
#   - OpenAI: platform.openai.com/docs/bots
#   - Anthropic: support.claude.com
#   - Perplexity: docs.perplexity.ai/guides/bots
#   - Google: developers.google.com/crawling
#   - Apple: support.apple.com/en-us/120320
#   - Meta: developers.facebook.com/docs/sharing/webmasters
#   - Cloudflare: 2025 verified bot traffic analysis
#   - Dark Visitors: verified crawler tracking
#   - Search Engine Journal: December 2025 crawler report
#
################################################################################

################################################################################
# SECTION 1: UNIVERSAL ALLOWANCE FOR ALL CRAWLERS
# Default policy: allow everyone (overridden by the specific groups below)
################################################################################

User-agent: *
Allow: /

################################################################################
# SECTION 2: TRADITIONAL SEARCH ENGINE CRAWLERS
# Essential for traditional SEO and foundational AI visibility (Gemini depends
# on Google Search ranking)
################################################################################

# GOOGLE SEARCH CRAWLERS (traditional SEO)
# These crawl for Google Search indexing
User-agent: Googlebot
Allow: /

User-agent: Googlebot-Image
Allow: /

User-agent: Googlebot-Video
Allow: /

User-agent: Googlebot-News
Allow: /

User-agent: Googlebot-Mobile
Allow: /

User-agent: GoogleOther
Allow: /

# BING SEARCH CRAWLERS
# Bingbot indexes for Bing Search, which partially feeds ChatGPT search
User-agent: Bingbot
Allow: /

User-agent: BingPreview
Allow: /

User-agent: Bingbot-Mobile
Allow: /

User-agent: MsnBot
Allow: /

################################################################################
# SECTION 3: GOOGLE AI CRAWLERS (Gemini, Vertex AI, Google AI Overviews)
# Controls Google's AI systems, including Gemini integration
################################################################################

# GOOGLE-EXTENDED: Control token for Gemini AI training
# NOTE: This is a robots.txt token ONLY - it has no separate HTTP user-agent.
# Crawling is done by the standard Googlebot; this token controls AI use.
User-agent: Google-Extended
Allow: /

# GOOGLE-CLOUDVERTEXBOT: For building Vertex AI Agents
User-agent: Google-CloudVertexBot
Allow: /

################################################################################
# SECTION 4: OPENAI CRAWLERS - CRITICAL FOR CHATGPT VISIBILITY
# Three separate crawlers with distinct purposes and compliance behaviors
################################################################################

# OAI-SEARCHBOT: Indexes content for ChatGPT Search results
# - PURPOSE: Real-time search indexing (NOT training)
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: OAI-SearchBot/1.0; +https://openai.com/searchbot
# - IMPORTANCE: CRITICAL for being cited in ChatGPT search responses
User-agent: OAI-SearchBot
Allow: /

# GPTBOT: Training-data collection for ChatGPT and GPT model training
# - PURPOSE: Foundation-model training for future versions
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: compatible; GPTBot/1.1; +https://openai.com/gptbot
# - IMPORTANCE: Affects inclusion in future training (optional to allow)
# - NOTE: Traffic increased 305% from May 2024 to May 2025
User-agent: GPTBot
Allow: /

# CHATGPT-USER: User-initiated content requests within ChatGPT
# - PURPOSE: Fetches pages when ChatGPT users request them
# - RESPECTS ROBOTS.TXT: NO (as of the December 2025 update)
# - USER-AGENT STRING: compatible; ChatGPT-User/1.0; +https://openai.com/bot
# - IMPORTANCE: CRITICAL - real users accessing your content
# - NOTE: Cannot be blocked via this file; the group below is advisory only
User-agent: ChatGPT-User
Allow: /

################################################################################
# SECTION 5: ANTHROPIC (CLAUDE) CRAWLERS - CRITICAL FOR CLAUDE VISIBILITY
# Five separate agents enabling financial document analysis and search
################################################################################

# CLAUDEBOT: Citation fetching and training for Claude
# - PURPOSE: Real-time citation fetching for Claude responses
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: ClaudeBot
# - IMPORTANCE: CRITICAL for being cited in Claude responses
User-agent: ClaudeBot
Allow: /

# CLAUDE-WEB: Web-focused Claude crawler
# - PURPOSE: Web content indexing for Claude search
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: Claude-Web
# - IMPORTANCE: Essential for Claude web-search visibility
User-agent: Claude-Web
Allow: /

# CLAUDE-SEARCHBOT: Claude search-engine indexing
# - PURPOSE: Dedicated search-functionality optimization
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: Claude-SearchBot
User-agent: Claude-SearchBot
Allow: /

# CLAUDE-USER: User-initiated requests within Claude
# - PURPOSE: Fetches content when Claude users request it
# - RESPECTS ROBOTS.TXT: NO
# - IMPORTANCE: Real users accessing your financial analytics
User-agent: Claude-User
Allow: /

# ANTHROPIC-AI: Anthropic bulk training crawler
# - PURPOSE: Model training and improvement
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: anthropic-ai
User-agent: anthropic-ai
Allow: /

################################################################################
# SECTION 6: PERPLEXITY AI CRAWLERS - CRITICAL FOR PERPLEXITY VISIBILITY
# Perplexity operates independent infrastructure (NOT dependent on Google/Bing)
################################################################################

# PERPLEXITYBOT: Perplexity AI search indexing
# - PURPOSE: Building Perplexity's independent search index
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: PerplexityBot/1.0; +https://perplexity.ai/perplexitybot
# - IMPORTANCE: CRITICAL - Perplexity is a major AI search platform
# - NOTE: Perplexity processes 100M+ queries monthly
User-agent: PerplexityBot
Allow: /

# PERPLEXITY-USER: User-initiated requests within Perplexity
# - PURPOSE: Fetches content when users query Perplexity
# - RESPECTS ROBOTS.TXT: NO
# - USER-AGENT STRING: Perplexity-User/1.0; +https://perplexity.ai/perplexity-user
# - IMPORTANCE: Real users accessing your content in Perplexity AI
User-agent: Perplexity-User
Allow: /

################################################################################
# SECTION 7: APPLE CRAWLERS (Siri, Spotlight, Apple Intelligence)
# Powers Apple ecosystem search and AI features
################################################################################

# APPLEBOT: Core Apple crawler for Siri, Spotlight, and Safari search
# - PURPOSE: Indexing for Siri, Spotlight, and Safari search
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: (Applebot/0.1; +http://www.apple.com/go/applebot)
# - IMPORTANCE: Growing as Apple Intelligence expands
User-agent: Applebot
Allow: /

# APPLEBOT-EXTENDED: Control token for Apple Intelligence training
# - PURPOSE: Apple Intelligence model-training control
# - RESPECTS ROBOTS.TXT: N/A - does not crawl; the token only affects training
# - USER-AGENT STRING: Applebot-Extended
# - IMPORTANCE: Controls whether content is used for Apple AI training
User-agent: Applebot-Extended
Allow: /

################################################################################
# SECTION 8: META/FACEBOOK CRAWLERS
# Meta's crawlers for AI model training and product improvement
################################################################################

# META-EXTERNALAGENT: Meta's primary AI crawler
# - PURPOSE: Content collection for AI model training and product improvement
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)
# - IMPORTANCE: Meta processes a significant volume of content
# - NOTE: Controversial for its high request volume, but legitimate
User-agent: meta-externalagent
Allow: /

# META-EXTERNALAGENT (alternative capitalization)
User-agent: Meta-ExternalAgent
Allow: /

# META-EXTERNALFETCHER: User-initiated requests from Meta AI
User-agent: Meta-ExternalFetcher
Allow: /

################################################################################
# SECTION 9: DUCKDUCKGO CRAWLERS
# DuckDuckGo search and AI-assisted answers
################################################################################

# DUCKDUCKBOT: DuckDuckGo search crawling
# - PURPOSE: DuckDuckGo search engine indexing
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)
# - IMPORTANCE: Preferred by privacy-focused search users
User-agent: DuckDuckBot
Allow: /

# DUCKASSISTBOT: DuckDuckGo AI-assisted answers
# - PURPOSE: AI-powered search answers on DuckDuckGo
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: (+https://duckduckgo.com/duckassistbot.html)
# - IMPORTANCE: DuckDuckGo's AI response engine
User-agent: DuckAssistBot
Allow: /

################################################################################
# SECTION 10: AMAZON/AWS CRAWLERS
# Alexa and Amazon AI systems
################################################################################

# AMAZONBOT: Amazon crawler for Alexa and AI services
# - PURPOSE: Alexa knowledge base and Amazon AI training
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: Amazonbot/0.1
User-agent: Amazonbot
Allow: /

################################################################################
# SECTION 11: BYTEDANCE/TIKTOK CRAWLERS
# Emerging AI crawler for TikTok and ByteDance AI systems
################################################################################

# BYTESPIDER: ByteDance crawler for TikTok and AI systems
# - PURPOSE: Content indexing for TikTok and AI model training
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: Bytespider/1.0
# - IMPORTANCE: Emerging but growing AI platform
User-agent: Bytespider
Allow: /

################################################################################
# SECTION 12: SPECIALIZED AI/LLM TRAINING CRAWLERS
# Used by research institutions and AI companies
################################################################################

# CCBOT: Common Crawl crawler
# - PURPOSE: Building a public internet archive used for LLM training
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: CCBot/2.0
# - IMPORTANCE: CRITICAL - used by 100+ different LLM training projects
# - NOTE: If allowed, your content may enter Common Crawl and reach many LLMs
User-agent: CCBot
Allow: /

# COHERE-AI: Cohere LLM crawler
# - PURPOSE: Training data for Cohere LLMs
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: cohere-ai
User-agent: cohere-ai
Allow: /

# YOUBOT: You.com AI search crawler
# - PURPOSE: AI-powered search engine indexing
# - RESPECTS ROBOTS.TXT: YES
# - USER-AGENT STRING: YouBot/1.0
User-agent: YouBot
Allow: /

################################################################################
# SECTION 13: CRAWL DELAY SETTINGS
# Prevents server overload by limiting crawl frequency.
# Crawl-delay values are integer seconds between requests from the same bot.
# NOTE: Crawl-delay and Request-rate are NOT part of RFC 9309. Bing and some
# other crawlers honor Crawl-delay (integer values only); Googlebot ignores it
# (set crawl rate in Search Console instead), and Request-rate is a legacy
# directive that most crawlers ignore.
# NOTE: Per RFC 9309, rules for the same user-agent across multiple groups are
# merged, but some parsers honor only the first matching group - consolidate
# per-agent rules into one group if a crawler misbehaves.
################################################################################

# Default crawl delay for most bots: 1 second between requests
User-agent: *
Crawl-delay: 1
Request-rate: 1/1s

# Faster crawling for major search engines (optimized infrastructure)
User-agent: Googlebot
Crawl-delay: 1
Request-rate: 2/1s

User-agent: Googlebot-News
Crawl-delay: 1
Request-rate: 2/1s

User-agent: Bingbot
Crawl-delay: 1
Request-rate: 1/1s

# Slightly slower for bulk AI training crawlers to limit load on our servers
User-agent: GPTBot
Crawl-delay: 2
Request-rate: 1/2s

User-agent: ClaudeBot
Crawl-delay: 2
Request-rate: 1/2s

User-agent: PerplexityBot
Crawl-delay: 1
Request-rate: 1/1s

User-agent: Google-Extended
Crawl-delay: 1
Request-rate: 1/1s

User-agent: meta-externalagent
Crawl-delay: 1
Request-rate: 1/1s

User-agent: CCBot
Crawl-delay: 2
Request-rate: 1/2s

################################################################################
# SECTION 14: SITEMAP REFERENCES
# Helps ALL crawlers discover and prioritize content efficiently
################################################################################

# Primary sitemap for all crawlers
Sitemap: https://incolintelligence.com/sitemap.xml

# AI-specific sitemaps (optional but recommended)
Sitemap: https://incolintelligence.com/sitemap_ai.xml
Sitemap: https://incolintelligence.com/sitemap_mortgage_analytics.xml
Sitemap: https://incolintelligence.com/sitemap_regulatory.xml
Sitemap: https://incolintelligence.com/sitemap_collateral_risk.xml

################################################################################
# SECTION 15: OPTIONAL - SELECTIVE ACCESS CONTROL
# UNCOMMENT ONLY IF YOU NEED GRANULAR RESTRICTIONS.
# Use these patterns to protect specific paths while still allowing crawlers
# to access public content.
################################################################################

# OPTION A: Protect admin/internal paths while allowing public content
# (COMMENTED OUT - uncomment if needed)
#
# User-agent: *
# Disallow: /admin/
# Disallow: /internal/
# Disallow: /private/
# Disallow: /api/internal/
# Disallow: /customer-data/
# Disallow: /secure/
# Allow: /public/
# Allow: /documentation/
# Allow: /research/
# Allow: /blog/
# Allow: /api/public/

# OPTION B: Block training crawlers but allow search crawlers
# (This prevents training inclusion while maintaining AI search visibility)
# (COMMENTED OUT - uncomment if needed)
#
# # Block training crawlers completely
# User-agent: GPTBot
# Disallow: /
#
# User-agent: ClaudeBot
# Disallow: /
#
# User-agent: anthropic-ai
# Disallow: /
#
# User-agent: Google-Extended
# Disallow: /
#
# User-agent: Applebot-Extended
# Disallow: /
#
# User-agent: meta-externalagent
# Disallow: /
#
# User-agent: CCBot
# Disallow: /
#
# User-agent: cohere-ai
# Disallow: /
#
# User-agent: Bytespider
# Disallow: /
#
# # Allow search crawlers for visibility in ChatGPT search, Claude search, etc.
# User-agent: OAI-SearchBot
# Allow: /
#
# User-agent: Claude-Web
# Allow: /
#
# User-agent: Claude-SearchBot
# Allow: /
#
# User-agent: PerplexityBot
# Allow: /
#
# User-agent: DuckAssistBot
# Allow: /
#
# User-agent: Googlebot
# Allow: /
#
# User-agent: Bingbot
# Allow: /

# OPTION C: Restrict certain sections while allowing general AI indexing
# (COMMENTED OUT - uncomment if needed)
# (Kept as a single User-agent group: a second "User-agent: *" group would
# conflict with the first under some parsers.)
#
# # Restrict admin and internal API, but allow crawlers to access public
# # documentation, research, and the blog
# User-agent: *
# Disallow: /admin/
# Disallow: /internal/
# Disallow: /api/v1/private/
# Allow: /api/v1/public/
# Allow: /documentation/
# Allow: /blog/
# Allow: /research/
# Allow: /case-studies/

################################################################################
# END OF ROBOTS.TXT
#
# DISCLAIMER
# This configuration follows the robots.txt standard (RFC 9309) plus widely
# recognized extensions (Sitemap, Crawl-delay, Request-rate) and reflects
# official crawler documentation as of January 2026. Not all bots honor
# robots.txt (user-initiated fetchers in particular bypass it), but this
# represents best practice for maximum visibility across legitimate AI
# platforms.
#
################################################################################
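The crawler groups above can be sanity-checked during edits with Python's standard-library `urllib.robotparser` (a sketch, not part of the served file). The excerpt below mirrors the structure used here; the `/admin/` Disallow is borrowed from the commented-out Option A purely for illustration. Note two parser caveats: `robotparser` drops non-integer `Crawl-delay` values (one reason this file uses whole seconds), and it evaluates rules first-match rather than RFC 9309 longest-match.

```python
from urllib.robotparser import RobotFileParser

# Small excerpt mirroring the groups in this robots.txt.
RULES = """\
User-agent: *
Allow: /
Crawl-delay: 1

User-agent: GPTBot
Crawl-delay: 2

User-agent: ClaudeBot
Disallow: /admin/
"""

rp = RobotFileParser()
rp.parse(RULES.splitlines())

# Which URLs may a given agent fetch?
print(rp.can_fetch("ClaudeBot", "https://incolintelligence.com/blog/"))   # True
print(rp.can_fetch("ClaudeBot", "https://incolintelligence.com/admin/"))  # False

# Crawl-delay is exposed per agent, falling back to the "*" group.
print(rp.crawl_delay("GPTBot"))   # 2
print(rp.crawl_delay("Bingbot"))  # 1
```

Running the same check against the live file (`rp.set_url(...)` plus `rp.read()`) confirms that a planned Disallow actually applies to the agent you intend before it ships.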