"""Make requests to a local Ollama server and stream the responses.

Run as a script with a model name to start an interactive chat. Each prompt
is sent to the server's ``/api/chat`` endpoint, the reply is streamed to
stdout as it arrives, and the conversation is saved to a timestamped text
file in the current directory.
"""
import argparse
import json
import sys
import time

import requests

OLLAMA_SERVER = "10.48.9.106"
OLLAMA_PORT = 11434
# Seconds to wait while establishing a connection. Only the connect phase is
# bounded; reading the streamed reply is left unbounded because generation
# can legitimately take a long time.
CONNECT_TIMEOUT = 5

SYSTEM_PROMPTS = [
    "Always answer in a CLI-friendly, plain text format. Use Markdown code blocks for code, and ensure code is easy to copy and paste. Do not use colors or special formatting that breaks copy-paste. Only use plain Markdown for code.",
]

# Completed turns of the current session; each entry is a dict with the keys
# "response_time", "prompt", "thought" and "response".
HIST_CONTEXT = []


def ping_server():
    """Check that the Ollama server answers HTTP requests.

    Prints a one-line status message either way.

    Returns:
        True if a GET on the server root returns status 200, False on any
        other status or on a connection error (including a connect timeout).
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}"
    try:
        response = requests.get(url, timeout=CONNECT_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Ollama server: {e}")
        return False
    if response.status_code == 200:
        print("Ollama server is reachable.")
        return True
    print(f"Failed to reach Ollama server: {response.status_code}")
    return False


def _build_messages(prompt):
    """Assemble the chat message list: system prompts, history, new prompt."""
    # Every message needs both "role" and "content"; a bare {"system": ...}
    # dict is not a valid chat message.
    messages = [{"role": "system", "content": msg} for msg in SYSTEM_PROMPTS]
    for hist in HIST_CONTEXT:
        # Skip turns that never produced a reply (e.g. failed requests).
        if not (hist["prompt"] and hist["response"]):
            continue
        messages.append({"role": "user", "content": hist["prompt"]})
        # NOTE(review): the stored "thinking" text is replayed as its own
        # assistant message ahead of the real reply. Kept as-is; confirm this
        # is the intended way to feed earlier reasoning back to the model.
        if hist["thought"]:
            messages.append({"role": "assistant", "content": hist["thought"]})
        messages.append({"role": "assistant", "content": hist["response"]})
    messages.append({"role": "user", "content": prompt})
    return messages


def stream_ollama_response(model, prompt, output_format="cli"):
    """Send ``prompt`` to the chat endpoint and stream the reply to stdout.

    The request carries the system prompts, every completed turn stored in
    ``HIST_CONTEXT`` and the new prompt. "thinking" tokens are echoed first,
    followed by the answer under its own header. A turn that streams to
    completion is appended to ``HIST_CONTEXT``.

    Args:
        model: Name of the model to query.
        prompt: The user's message for this turn.
        output_format: Currently unused; kept so existing callers keep working.

    Returns:
        The full answer text. An empty string is returned when the server
        responds with a non-200 status; if the connection fails part-way
        through, whatever text was received so far is returned and the turn
        is not added to the history.
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}/api/chat"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "messages": _build_messages(prompt),
        "stream": True,
    }

    thought_parts = []
    response_parts = []
    in_thinking = True

    try:
        # The context manager releases the connection even when we leave the
        # loop early or an exception is raised mid-stream.
        with requests.post(
            url,
            headers=headers,
            json=data,
            stream=True,
            timeout=(CONNECT_TIMEOUT, None),
        ) as response:
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                return ""

            print("Thinking (plain text, code blocks are copy-paste ready):")
            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode("utf-8")
                # Tolerate SSE-style framing as well as plain JSON lines.
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                if decoded_line == "[DONE]":
                    break
                try:
                    chunk = json.loads(decoded_line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(chunk, dict):
                    continue
                # Once streaming has started the HTTP status can no longer
                # change, so failures arrive as an object with an "error" key.
                if chunk.get("error"):
                    print(f"\nError: {chunk['error']}")
                    break

                message = chunk.get("message") or {}
                if not isinstance(message, dict):
                    continue

                # Stream 'thinking' tokens live.
                thinking = message.get("thinking")
                if thinking:
                    sys.stdout.write(thinking)
                    sys.stdout.flush()
                    thought_parts.append(thinking)

                # When content starts, print a header once, then stream it.
                content = message.get("content")
                if content:
                    if in_thinking:
                        print("\n\nResponse (copy-paste code blocks as needed):")
                        in_thinking = False
                    sys.stdout.write(content)
                    sys.stdout.flush()
                    response_parts.append(content)
    except requests.exceptions.RequestException as e:
        print(f"\nError connecting to Ollama server: {e}")
        return "".join(response_parts)

    print()  # newline after completion

    response_text = "".join(response_parts)
    # Add the finished turn to the history context.
    HIST_CONTEXT.append({
        "response_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "prompt": prompt,
        "thought": "".join(thought_parts),
        "response": response_text,
    })
    return response_text


def main():
    """Run the interactive chat loop and save the transcript to a file."""
    parser = argparse.ArgumentParser(description="Stream response from local Ollama server.")
    parser.add_argument("model", type=str, help="The model to use (e.g., 'llama2', 'gpt4o').")
    args = parser.parse_args()

    # argparse already requires the positional; this rejects an empty string.
    if not args.model:
        print("Model is required.")
        sys.exit(1)

    if not ping_server():
        sys.exit(1)

    conversation_start_time = time.strftime("%Y%m%d-%H%M%S")
    output_filename = f"conversation_{conversation_start_time}.txt"
    print(f"\nEnter your prompt below. Type 'q', 'quit', 'done', or 'bye' to end the conversation.\nAll responses will be saved to {output_filename}.\n")

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(f"Conversation started at {conversation_start_time}\nModel: {args.model}\n\n")
        while True:
            try:
                prompt = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nConversation ended.")
                break
            if prompt.lower() in {"q", "quit", "done", "bye"}:
                print("Conversation ended.")
                break
            if not prompt:
                continue
            f.write(f"You: {prompt}\n")
            response_text = stream_ollama_response(args.model, prompt, output_format="cli")
            f.write(f"AI: {response_text}\n\n")
            # Flush after every turn so the transcript survives an abnormal exit.
            f.flush()


if __name__ == "__main__":
    main()