# Interactive CLI chat client: sends prompts to an Ollama server, streams the replies, and saves the transcript to a file.
import requests
import sys
import time
import json
import argparse

# Host and TCP port used to build every request URL sent to the Ollama server.
OLLAMA_SERVER = "10.48.9.106"
OLLAMA_PORT = 11434
# Instructions sent at the start of the message list of every chat request.
SYSTEM_PROMPTS = [
    "Always answer in a CLI-friendly, plain text format. Use Markdown code blocks for code, and ensure code is easy to copy and paste. Do not use colors or special formatting that breaks copy-paste. Only use plain Markdown for code.",
]
# In-memory conversation history for the current process. Each entry is a dict
# with the keys "response_time", "prompt", "thought" and "response"; earlier
# entries are replayed to the server on every new request.
HIST_CONTEXT = []

def ping_server():
    """Check whether the Ollama server answers on its base URL.

    Sends a GET request to ``http://OLLAMA_SERVER:OLLAMA_PORT`` and prints
    the outcome.

    Returns:
        True if the server replied with HTTP 200; False on any other status
        code, on a connection error, or when the request times out.
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}"
    try:
        # requests has no default timeout; without one an unresponsive host
        # would block this call (and the whole script) indefinitely.
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Ollama server: {e}")
        return False

    if response.status_code == 200:
        print("Ollama server is reachable.")
        return True

    print(f"Failed to reach Ollama server: {response.status_code}")
    return False

def _build_chat_messages(prompt):
    """Assemble the message list: system prompts, replayed history, new prompt."""
    # Every chat message needs a "role" and a "content" key; a bare
    # {"system": ...} dict carries neither.
    messages = [{"role": "system", "content": msg} for msg in SYSTEM_PROMPTS]

    for hist in HIST_CONTEXT:
        # Skip turns with an empty prompt or an empty reply.
        if not (hist["prompt"] and hist["response"]):
            continue
        messages.append({"role": "user", "content": hist["prompt"]})
        if hist["thought"]:
            # NOTE(review): the model's earlier reasoning is replayed as its
            # own assistant message, so the history holds two consecutive
            # assistant turns -- confirm this is intended.
            messages.append({"role": "assistant", "content": hist["thought"]})
        messages.append({"role": "assistant", "content": hist["response"]})

    messages.append({"role": "user", "content": prompt})
    return messages


def stream_ollama_response(model, prompt, output_format="cli"):
    """Send *prompt* to the Ollama chat endpoint and stream the reply to stdout.

    The request carries the system prompts, the replayed conversation history
    from ``HIST_CONTEXT`` and the new prompt. "thinking" tokens and "content"
    tokens are written to stdout as they arrive. After the stream ends, the
    exchange is appended to ``HIST_CONTEXT``.

    Args:
        model: Name of the model to query.
        prompt: The user's message for this turn.
        output_format: Currently unused; kept for interface compatibility.

    Returns:
        The concatenated "content" text of the reply. An empty string is
        returned if the request could not be sent or the server answered
        with a non-200 status (in which case nothing is added to history).
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}/api/chat"
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "messages": _build_chat_messages(prompt),
        "stream": True,
    }

    try:
        # Bound only the connect phase: the wait for the first token can be
        # arbitrarily long, so the read timeout is left unlimited.
        response = requests.post(
            url, headers=headers, json=data, stream=True, timeout=(10, None)
        )
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Ollama server: {e}")
        return ""

    thought_parts = []
    content_parts = []

    # The context manager releases the streamed connection on every exit path.
    with response:
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            return ""

        print("Thinking (plain text, code blocks are copy-paste ready):")
        in_thinking = True

        try:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode('utf-8')
                # Tolerate SSE-style framing ("data: " prefix and a "[DONE]"
                # sentinel) in case the stream uses it.
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                if decoded_line == "[DONE]":
                    break
                try:
                    chunk = json.loads(decoded_line)
                except json.JSONDecodeError:
                    # Best effort: ignore lines that are not valid JSON.
                    continue
                if 'message' not in chunk:
                    continue
                message = chunk['message']
                # Stream 'thinking' tokens live
                if 'thinking' in message and message['thinking']:
                    sys.stdout.write(message['thinking'])
                    sys.stdout.flush()
                    thought_parts.append(message['thinking'])
                # When content starts, print a header once and stream content live
                if 'content' in message and message['content']:
                    if in_thinking:
                        print("\n\nResponse (copy-paste code blocks as needed):")
                        in_thinking = False
                    sys.stdout.write(message['content'])
                    sys.stdout.flush()
                    content_parts.append(message['content'])
        except requests.exceptions.RequestException as e:
            # Keep whatever was received before the connection failed.
            print(f"\nStream interrupted: {e}")
    print()  # for newline after completion

    thoughts = "".join(thought_parts)
    response_text = "".join(content_parts)

    # add to history context
    HIST_CONTEXT.append({
        "response_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "prompt": prompt,
        "thought": thoughts,
        "response": response_text
    })
    return response_text

if __name__ == "__main__":
    # Script entry point: parse the model name, verify the server is reachable,
    # then run an interactive prompt loop whose exchanges are saved to a
    # timestamped transcript file in the current directory.
    parser = argparse.ArgumentParser(description="Stream response from local Ollama server.")
    parser.add_argument("model", type=str, help="The model to use (e.g., 'llama2', 'gpt4o').")
    args = parser.parse_args()

    # argparse enforces presence, but an explicitly empty string ("") still
    # gets through, so reject it here.
    if not args.model:
        print("Model is required.")
        sys.exit(1)

    if not ping_server():
        sys.exit(1)

    conversation_start_time = time.strftime("%Y%m%d-%H%M%S")
    output_filename = f"conversation_{conversation_start_time}.txt"
    print(f"\nEnter your prompt below. Type 'q', 'quit', 'done', or 'bye' to end the conversation.\nAll responses will be saved to {output_filename}.\n")

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(f"Conversation started at {conversation_start_time}\nModel: {args.model}\n\n")
        while True:
            try:
                prompt = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nConversation ended.")
                break
            if prompt.lower() in {"q", "quit", "done", "bye"}:
                print("Conversation ended.")
                break
            if not prompt:
                continue
            f.write(f"You: {prompt}\n")
            try:
                response_text = stream_ollama_response(args.model, prompt, output_format="cli")
            except KeyboardInterrupt:
                # Ctrl-C while a reply is streaming ends the session cleanly
                # instead of escaping as a traceback.
                f.write("AI: [interrupted]\n\n")
                print("\nConversation ended.")
                break
            f.write(f"AI: {response_text}\n\n")
            # Persist each exchange right away so an abnormal exit does not
            # lose buffered transcript data.
            f.flush()