diff --git a/misc/llama-cpp/Makefile b/misc/llama-cpp/Makefile
index fafd959c3860..37c41e840c80 100644
--- a/misc/llama-cpp/Makefile
+++ b/misc/llama-cpp/Makefile
@@ -1,64 +1,75 @@
 PORTNAME=	llama-cpp
 DISTVERSIONPREFIX=	b
 DISTVERSION=	5054
+PORTREVISION=	1
 CATEGORIES=	misc # machine-learning
 
 MAINTAINER=	yuri@FreeBSD.org
 COMMENT=	Facebook's LLaMA model in C/C++ # '
 WWW=		https://github.com/ggerganov/llama.cpp
 
 LICENSE=	MIT
 LICENSE_FILE=	${WRKSRC}/LICENSE
 
 BROKEN_armv7=	clang crashes, see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=278810
 BROKEN_i386=	compilation fails, see https://github.com/ggerganov/llama.cpp/issues/9545
 
 USES=		cmake:testing compiler:c++11-lang python:run shebangfix
 USE_LDCONFIG=	yes
 
 USE_GITHUB=	yes
 GH_ACCOUNT=	ggerganov
 GH_PROJECT=	llama.cpp
 GH_TUPLE=	nomic-ai:kompute:4565194:kompute/kompute
 
 SHEBANG_GLOB=	*.py
 
 CMAKE_ON=	BUILD_SHARED_LIBS
 CMAKE_OFF=	LLAMA_BUILD_TESTS
 CMAKE_TESTING_ON=	LLAMA_BUILD_TESTS
 
+# user for llama-server, only used when EXAMPLES=ON
+USER=		nobody
+SUB_LIST=	USER=${USER}
+
 OPTIONS_DEFINE=		CURL EXAMPLES VULKAN
 OPTIONS_DEFAULT=	CURL VULKAN
 OPTIONS_SUB=		yes
 
 CURL_DESCR=		Use libcurl to download model from an URL
 CURL_CMAKE_BOOL=	LLAMA_CURL
 CURL_USES=		localbase
 CURL_LIB_DEPENDS=	libcurl.so:ftp/curl
 
 EXAMPLES_CMAKE_BOOL=	LLAMA_BUILD_EXAMPLES
 
 VULKAN_DESC=		Vulkan GPU offload support
 VULKAN_CMAKE_BOOL=	GGML_VULKAN
 VULKAN_BUILD_DEPENDS=	glslc:graphics/shaderc \
 			vulkan-headers>0:graphics/vulkan-headers
 VULKAN_LIB_DEPENDS=	libvulkan.so:graphics/vulkan-loader
 
 BINARY_ALIAS=	git=false \
 		python=${PYTHON_CMD} # for tests
 
 do-test-ci: # build of tests fails, see https://github.com/ggerganov/llama.cpp/issues/10955
 	@cd ${WRKSRC} && \
 		${SETENV} ${MAKE_ENV} bash ci/run.sh ./tmp/results ./tmp/mnt
 
+.include <bsd.port.options.mk>
+
+.if ${PORT_OPTIONS:MEXAMPLES}
+USE_RC_SUBR=	llama-server
+.endif
+
 # tests as of 4458: 97% tests passed, 1 tests failed out of 31, see https://github.com/ggerganov/llama.cpp/issues/11036
 # tests as of 4649:
 # 88% tests passed, 4 tests failed out of 32
 # The following tests FAILED:
 #	 18 - test-chat (Subprocess aborted)	main	# see https://github.com/ggerganov/llama.cpp/issues/11705
 #	 24 - test-gguf (SEGFAULT)	main
 #	 25 - test-backend-ops (SEGFAULT)	main
 #	 32 - test-eval-callback (SEGFAULT)	curl	eval-callback
 
 .include <bsd.port.mk>
diff --git a/misc/llama-cpp/files/llama-server.in b/misc/llama-cpp/files/llama-server.in
new file mode 100644
index 000000000000..d3e564ee488c
--- /dev/null
+++ b/misc/llama-cpp/files/llama-server.in
@@ -0,0 +1,66 @@
+#!/bin/sh
+
+#
+# PROVIDE: llama_server
+# REQUIRE: LOGIN
+# KEYWORD: shutdown
+
+# Add the following lines to /etc/rc.conf to enable llama_server
+# llama_server_enable="YES"
+#
+# llama_server_enable (bool):	Set to YES to enable llama_server
+#				Default: NO
+# llama_server_user (str):	llama_server daemon user
+#				Default: %%USER%%
+# llama_server_model (str):	AI model that llama-server will use
+#				Default: "" (required)
+# llama_server_args (str):	Additional arguments for llama-server
+#				Default: "" (optional)
+# llama_server_log (str):	Log file that llama-server will write log to
+#				Default: "/var/log/llama-server.log" (optional)
+# llama_server_pidfile (str):	Pidfile that llama-server's pid will be written to
+#				Default: "/var/run/llama_server.pid" (optional)
+
+. /etc/rc.subr
+
+name="llama_server"
+rcvar=llama_server_enable
+load_rc_config $name
+
+: ${llama_server_enable:="NO"}
+: ${llama_server_user:="%%USER%%"}
+: ${llama_server_model:=""}
+: ${llama_server_args:=""}
+: ${llama_server_log:="/var/log/llama-server.log"}
+: ${llama_server_pidfile:="/var/run/${name}.pid"}
+
+run_command="%%PREFIX%%/bin/llama-server"
+procname="${run_command}"
+pidfile=${llama_server_pidfile}
+command=/usr/sbin/daemon
+command_args="-f -t ${name} -p ${pidfile} -o ${llama_server_log} ${run_command} -m ${llama_server_model} ${llama_server_args} --keep -1"
+start_precmd="llama_server_precmd"
+llama_server_chdir=/tmp
+
+llama_server_precmd()
+{
+	# check model
+	if [ -z "${llama_server_model}" ]; then
+		echo "llama_server_model isn't set, it is required"
+		exit 1
+	fi
+	if [ ! -f "${llama_server_model}" ]; then
+		echo "llama_server_model isn't a file"
+		exit 1
+	fi
+
+	# initialize pidfile
+	#install -o ${llama_server_user} /dev/null ${llama_server_pidfile}
+
+	# ensure that the log file exists and has right permissions
+	touch ${llama_server_log}
+	chown ${llama_server_user} ${llama_server_log}
+	chmod 640 ${llama_server_log}
+}
+
+run_rc_command "$1"
diff --git a/misc/llama-cpp/pkg-message b/misc/llama-cpp/pkg-message
index 071e82665d9a..157a4db6ea78 100644
--- a/misc/llama-cpp/pkg-message
+++ b/misc/llama-cpp/pkg-message
@@ -1,17 +1,27 @@
 [
 {
 type: install
 message: <<EOM
 ...
+
+To run llama-server as a service add the following to /etc/rc.conf:
+> llama_server_enable=YES
+> llama_server_model=/path/to/models/llama-2-7b-chat.Q4_K_M.gguf
+> llama_server_args="--device Vulkan0 -ngl 27"
 EOM
 }
 ]