Routing setup with permaURLs and content negotiation

1. Caddy version (caddy version):

v2.4.6 h1:HGkGICFGvyrodcqOOclHKfvJC0qTU7vny/7FhYp9hNw=

2. How I run Caddy:

a. System environment:

Ubuntu 20.04

b. Command:

sudo systemctl start caddy.service

c. Service/unit/compose file:

[Unit]
Description=Caddy HTTP/2 web server
Documentation=https://caddyserver.com/docs
After=network-online.target
Wants=network-online.target systemd-networkd-wait-online.service

; Do not allow the process to be restarted in a tight loop. If the
; process fails to start, something critical needs to be fixed.
StartLimitIntervalSec=86400
StartLimitBurst=10

[Service]
Restart=on-failure
RestartSec=1080

; User and group the process will run as.
User=www-data
Group=podman

ExecStart=/usr/local/bin/caddy run --config /etc/caddy/caddy_config.json --resume
; ExecReload=/usr/local/bin/caddy reload --config /etc/caddy/caddy_config.json
ExecStop=/usr/local/bin/caddy stop

; Limit the number of file descriptors; see `man systemd.exec` for more limit settings.
LimitNOFILE=1048576

; Use graceful shutdown with a reasonable timeout
KillMode=mixed
KillSignal=SIGQUIT
TimeoutStopSec=5s

; Use private /tmp and /var/tmp, which are discarded after caddy stops.
PrivateTmp=true
; Use a minimal /dev (brings additional security, but it may not work on Raspberry Pis or other devices)
PrivateDevices=true
; ProtectHome=true would hide /home, /root, and /run/user so nobody can steal your SSH keys; it is disabled here, so those directories stay visible to caddy.
ProtectHome=false
; Make /usr, /boot, /etc and possibly some more folders read-only.
ProtectSystem=full

; The following additional security directives only work with systemd v229 or later.
; They further restrict privileges that can be gained by caddy.
; Note that you may have to add capabilities required by any plugins in use.
CapabilityBoundingSet=CAP_NET_BIND_SERVICE
AmbientCapabilities=CAP_NET_BIND_SERVICE
NoNewPrivileges=true

[Install]
WantedBy=multi-user.target

d. My complete Caddyfile or JSON config:

{
	"logging": {
		"logs": {
			"default": {
				"writer": {
					"filename": "/var/log/caddy/access.log",
					"output": "file"
				},
				"encoder": {
					"format": "console"
				},
				"level": "DEBUG"
			}
		}
	},
	"apps": {
		"http": {
			"servers": {
				"srv0": {
					"listen": [
						":443"
					],
					"routes": [
						{
							"match": [
								{
									"host": [
										"search.test.salamanca.school"
									]
								}
							],
							"handle": [
								{
									"handler": "subroute",
									"routes": [
										{
											"handle": [
												{
													"handler": "vars",
													"root": "/opt/opensphinxsearch/public"
												},
												{
													"handler": "headers",
													"response": {
														"set": {
															"Access-Control-Allow-Headers": [
																"*"
															],
															"Access-Control-Allow-Methods": [
																"*"
															],
															"Access-Control-Allow-Origin": [
																"*"
															]
														}
													}
												},
												{
													"encodings": {
														"gzip": {},
														"zstd": {}
													},
													"handler": "encode",
													"prefer": [
														"zstd",
														"gzip"
													]
												},
												{
													"handler": "push"
												}
											]
										},
										{
											"handle": [
												{
													"handler": "static_response",
													"headers": {
														"Location": [
															"{http.request.uri.path}/"
														]
													},
													"status_code": 308
												}
											],
											"match": [
												{
													"file": {
														"try_files": [
															"{http.request.uri.path}/index.php"
														]
													},
													"not": [
														{
															"path": [
																"*/"
															]
														}
													]
												}
											]
										},
										{
											"handle": [
												{
													"handler": "rewrite",
													"uri": "{http.matchers.file.relative}"
												}
											],
											"match": [
												{
													"file": {
														"split_path": [
															".php"
														],
														"try_files": [
															"{http.request.uri.path}",
															"{http.request.uri.path}/index.php",
															"index.php"
														]
													}
												}
											]
										},
										{
											"handle": [
												{
													"handler": "reverse_proxy",
													"transport": {
														"protocol": "fastcgi",
														"split_path": [
															".php"
														]
													},
													"upstreams": [
														{
															"dial": "unix//run/php/php-fpm.sock"
														}
													]
												}
											],
											"match": [
												{
													"path": [
														"*.php"
													]
												}
											]
										},
										{
											"handle": [
												{
													"handler": "file_server",
													"hide": [
														"./Caddyfile"
													]
												}
											]
										}
									]
								}
							],
							"terminal": true
						},
						{
							"match": [
								{
									"host": [
										"c104-129.cloud.gwdg.de",
										"test.salamanca.school",
										"api.test.salamanca.school",
										"www.test.salamanca.school"
									]
								}
							],
							"handle": [
								{
									"handler": "subroute",
									"routes": [
										{
											"handle": [
												{
													"handler": "vars",
													"root": "/var/data/caddy/site"
												},
												{
													"handler": "headers",
													"response": {
														"set": {
															"Access-Control-Allow-Headers": [
																"*"
															],
															"Access-Control-Allow-Methods": [
																"*"
															],
															"Access-Control-Allow-Origin": [
																"*"
															]
														}
													}
												}
											]
										},
										{
											"group": "group2",
											"handle": [
												{
													"handler": "rewrite",
													"uri": "/sal/"
												}
											],
											"match": [
												{
													"path": [
														"/sal"
													]
												}
											]
										},
										{
											"handle": [
												{
													"handler": "rewrite",
													"uri": "{http.matchers.file.relative}"
												}
											],
											"match": [
												{
													"file": {
														"try_files": [
															"{http.request.uri.path}.html",
															"{http.request.uri.path}"
														]
													}
												}
											]
										},
										{
											"handle": [
												{
													"encodings": {
														"gzip": {},
														"zstd": {}
													},
													"handler": "encode",
													"prefer": [
														"zstd",
														"gzip"
													]
												},
												{
													"handler": "push"
												},
												{
													"handler": "templates"
												},
												{
													"handler": "templates"
												}
											]
										},
										{
											"handle": [
												{
													"handler": "subroute",
													"routes": [
														{
															"handle": [
																{
																	"handler": "rewrite",
																	"strip_path_prefix": "/sal"
																}
															]
														},
														{
															"group": "group0",
															"handle": [
																{
																	"handler": "rewrite",
																	"uri": "/exist/apps/salamanca/{http.request.uri.path}"
																}
															]
														},
														{
															"handle": [
																{
																	"handler": "reverse_proxy",
																	"headers": {
																		"response": {
																			"set": {
																				"Access-Control-Allow-Headers": [
																					"*"
																				],
																				"Access-Control-Allow-Methods": [
																					"*"
																				],
																				"Access-Control-Allow-Origin": [
																					"*"
																				]
																			}
																		}
																	},
																	"upstreams": [
																		{
																			"dial": "localhost:8080"
																		}
																	]
																}
															]
														}
													]
												}
											],
											"match": [
												{
													"path": [
														"/sal/*"
													]
												}
											]
										},
										{
											"handle": [
												{
													"handler": "file_server",
													"hide": [
														"./Caddyfile"
													]
												}
											]
										}
									]
								}
							],
							"terminal": true
						},
						{
							"match": [
								{
									"host": [
										"id.test.salamanca.school"
									]
								}
							],
							"handle": [
								{
									"handler": "subroute",
									"routes": [
										{
											"handle": [
												{
													"@id": "routing_map",
													"destinations": [
														"{myfile}",
														"{online}"
													],
													"handler": "map",
													"source": "{http.request.uri.path}"
												},
												{
													"handler": "headers",
													"response": {
														"set": {
															"Access-Control-Allow-Headers": [
																"*"
															],
															"Access-Control-Allow-Methods": [
																"*"
															],
															"Access-Control-Allow-Origin": [
																"*"
															]
														}
													}
												}
											]
										},
										{
											"handle": [
												{
													"handler": "static_response",
													"headers": {
														"Location": [
															"https://www.test.salamanca.school/data/{myfile}"
														]
													},
													"status_code": 302
												}
											],
											"match": [
												{
													"path": [
														"/texts/*"
													]
												}
											]
										}
									]
								}
							],
							"terminal": true
						}
					],
					"automatic_https": {
						"disable_redirects": true
					}
				}
			}
		},
		"tls": {
			"automation": {
				"policies": [
					{
						"subjects": [
							"api.test.salamanca.school",
							"c104-129.cloud.gwdg.de",
							"test.salamanca.school",
							"www.test.salamanca.school",
							"search.test.salamanca.school",
							"id.test.salamanca.school"
						],
						"issuers": [
							{
								"email": "bla@xy.com",
								"module": "acme"
							},
							{
								"email": "bla@xy.com",
								"module": "zerossl"
							}
						]
					}
				]
			}
		}
	}
}

The crucial parts have been generated from a Caddyfile that looked like this:

www.test.salamanca.school {
	# Serve everything else from the file system
	root * /var/data/caddy/site
	file_server

	# Make HTML file extension optional
	try_files {path}.html {path}
}
id.test.salamanca.school {
	map {path} {mydest} {online} {
		#   PATH                 FILE                                                               ONLINE
		# ------------------------------------------------------------------------------------------------------
		# /texts/W0004:frontmatter W0004/00001_W0004-00-0005-fm-03e8.html#W0004-00-0005-fm-03e8       yes
	}
	redir /texts/* https://www.test.salamanca.school/data/{mydest}
}

(I have added an id to the map so as to be able to address it directly in caddy’s API.)

and then when data is added, entries are added to the map by POSTing things like this:

curl -X POST -H "Content-Type: application/json" -d @patch_mapping.json "http://localhost:2019/id/routing_map/mappings/..."

with dynamically created maps such as:

[ {
  "input" : "/texts/W0095",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-fm-03e8", "yes" ]
}, {
  "input" : "/texts/W0095?format=html",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-fm-03e8", "yes" ]
}, {
  "input" : "/texts/W0095?format=iiif",
  "outputs" : [ "W0095/W0095.json", "yes" ]
}, {
  "input" : "/texts/W0095?format=rdf",
  "outputs" : [ "W0095/W0095.rdf", "yes" ]
}, {
  "input" : "/texts/W0095?format=pdf",
  "outputs" : [ "W0095/W0095.pdf", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-fm-03e8", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.titlepage",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-tp-03e8", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.pI",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#pageNo_W0095-00-0001-pb-03e8", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.1",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0002-d1-03e8", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.1.heading",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0002-he-03e8", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.pII",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#pageNo_W0095-00-0002-pb-03e9", "yes" ]
}, {
  "input" : "/texts/W0095:frontmatter.1.1",
  "outputs" : [ "W0095/html/00001_W0095-00-0001-fm-03e8#W0095-00-0002-pa-03eb", "yes" ]
}, {
  "input" : "/texts/W0095:1",
  "outputs" : [ "W0095/html/00003_W0095-00-0003-d1-03e9#W0095-00-0003-d1-03e9", "yes" ]
}, {
  "input" : "/texts/W0095:1.heading",
  "outputs" : [ "W0095/html/00003_W0095-00-0003-d1-03e9#W0095-00-0003-he-03e9", "yes" ]
}]

(FWIW, I am not even sure any more why I included the second ("yes") value in the “outputs” array. I think it was so that I would be able to handle resources that are not yet “fully” online, e.g. images yes but no fulltext html…)

3. The problem I’m having:

Generally, I am trying to re-implement a system that you can see live at https://www.salamanca.school, because the current system has some serious performance issues. One of the measures I am trying to apply is taking the html shipping and the routing away from the (eXist-db/xQuery) backend and handling them in caddy. More concretely, this question/post is about the routing part of it. (I have described some of the logic of the current system in a blogpost back in 2016.)

As you can see in the config, I am trying to do several things, and I’ll try to keep them apart in what follows. And I hope it is okay to have them in this one posting, because they are all related in one way or another to the map/redir routing mechanism outlined above …

a. Routing permaURLs to actual html files

Most basically, I want to offer permaURLs for documents and document sections like https://id.test.salamanca.school/texts/W0095:1 (for the first chapter of the document called W0095). Since the works are quite large, they are segmented into different html files and the map keeps track of which section is in which html document, translating the permaURL to an html file residing in the filesystem (adding a fragment identifier if necessary).

So far, this seems to work fine, but first of all, I’d like to make sure I’ll not run into problems when the map contains hundreds of thousands of entries. Currently I have 19,000 entries, and I have only uploaded some small documents. I would not be surprised if I end up with more than a million entries. I could imagine setting up a KVS like redis or something if it becomes too big for caddy to handle as a map. But maybe it will just be fine…?

b. Bypassing content negotiation with an explicit format URL query parameter

Many of the resources can be represented in several ways: html, plaintext, pdf, images and even rdf/xml. I want to set up a content negotiation mechanism (see next question), but I want to be able to bypass this by passing a format=XY query parameter in the url. So the url https://id.test.salamanca.school/texts/W0095?format=iiif would take me straight to the (json) iiif Manifest for the whole work (that’s the 3rd entry in the example map posted above).

However, the query parameter, while mentioned in the map’s input fields, seems not to be evaluated by my setup. I guess that’s because in my config, the redir directive has only a simple path matcher whereas for this, I’d need a query matcher. Do I need a second redir directive with a query matcher and a second map of its own to take care of the format URL query parameters or can/should I do this differently? (Obviously, as for the simple case described in (a.), there will also be lots of these redirections.)

c. Content negotiation

Finally, if no format is specified in the request URL, I’d like to redirect differently based on the request’s Accept header.

Any information I could find about “caddy content negotiation” was concerned with the encoding (in fact, compression) of one and the same resource (1, 2, 3), but this is about redirecting to different resources (as described e.g. in the W3C’s Cool URIs for the Semantic Web). When I did this in the “old system”, I re-implemented some golang content negotiation code from httputil in xQuery.

But I must say that now, I don’t quite know where to start with this: Should I try to work with a header_regexp matcher? Can this handle the case when the client specifies several formats in the Accept header, distinguished by q parameters? And how can I combine this with the routing mechanism described above? Do I need to switch cases based on the header_regexp and then have one of those maps per data format?

I am sorry that this is such a convoluted posting, but I really have difficulties in keeping the different aspects isolated here.
I would be really thankful for any help!

Best,
Andreas

(PS. I don’t have any error message, and above I have described what I have done so far, so I skip the remaining points of the posting template, I hope this is okay.)

1 Like

That’s cause you’re using {path} for your map, but that doesn’t include the query part of the URL. You’d need to use {uri} to include the query. The {uri} is basically the same as {path}?{query} (except ? may be absent if there’s no query).

My understanding is that browsers won’t automatically fill Accept in such a way that this would be useful. Browsers only have a small handful of default Accept header values they will use, and those depend on how the content is being loaded (like, which HTML tag is trying to source the content). See here:

So I guess it depends what kind of clients you’re using here. Are your users using software (not a browser) that would specify an Accept header in particular?

But yeah – Caddy doesn’t currently have facilities for doing content negotiation except for compression, and regexp is unlikely to be sufficient because order matters, and q= params throw a wrench into things.

You could write a Caddy plugin to do that content negotiation logic – Since what you quoted was written in Go, that shouldn’t be too hard, since Caddy (and plugins) are written in Go.

Have you considered restructuring those maps to something more like a tree? You could then access the right “output” much more quickly.

If I were to do it, I’d write a quick PHP script to spit out the redirect needed. AFAICT, all your inputs are requests like /texts/*, so you could use a path matcher on /texts/* to invoke php_fastcgi which would hit your index.php script which could perform the necessary logic then respond with the redirect (i.e. write the Location header).

But I’d suggest a JSON tree something like this, to speed up lookups:

{
    "texts": {
        "W0095": {
            "formats": {
                "html": "/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-fm-03e8",
                "iiif": "/W0095.json",
                "rdf": "/W0095.rdf",
                "pdf": "/W0095.pdf"
            },
            "chapters": {
                "frontmatter": "/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-fm-03e8",
                "frontmatter.titlepage": "/html/00001_W0095-00-0001-fm-03e8#W0095-00-0001-tp-03e8",
                "1": "/html/00003_W0095-00-0003-d1-03e9#W0095-00-0003-d1-03e9"
            }
        },
        "W0096": {

        }
    }
}

And then it’s a pretty simple algorithm to find the right path:

  • Grab the “text” name from the URL (first segment after /texts/ ending in the first : if present)
  • Do a lookup in the map by text
  • Check the ?format= if set – if it’s a known format, just redirect straight to that URL (i.e. text_name + format_lookup as your final URL)
  • Check the Accept header to try to guess the format (if not HTML), then lookup by known format.
  • Parse the chapter name by grabbing the part following the : in the path, do a lookup and redirect to it if set
  • If not found but you do have a text name then I guess redirect to the html format by default :man_shrugging:

This should be very fast because you’re performing hashmap lookups in O(1) time, instead of scanning through a list of rules which could be O(n) to compare against each one. Especially if you’re quoting a possible million URLs, this would easily be the most efficient way to go about it. And you could do this in any programming language you like (but like I said my personal go-to would be PHP because it’s easy to write and test quickly and has no build/compile time).

2 Likes

Thank you a lot! I will try the {uri} placeholder, and think about the rest of your comments and suggestions more carefully so I am sure I understand what you’re saying. (In other words, I apologize for not immediately responding more substantially. Will surely do so later.)

This topic was automatically closed after 30 days. New replies are no longer allowed.