CogStack · adam-sutton-1992 · Apr 24, 2026 · Apr 24, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py
@@ -70,13 +70,46 @@ class EmbeddingLinking(Linking):
     """Choose a device for the linking model to be stored. If None
     then an appropriate GPU device that is available will be chosen"""
     context_window_size: int = 14
-    """Choose the window size to get context vectors."""
+    """Choose the window size to get context vectors. In a trained model 
+    if you increase the context window after training then performance will
+    degrade significantly."""
     use_ner_link_candidates: bool = True
     """Link candidates are provided by some NER steps. This will flag if 
-    you want to trust them or not."""
+    you want to trust them or not. A good guideline is if you've trained 
+    on data from the same distribution then this is probably best set to True.
+    If you have no training data from the same source distribution then it MIGHT
+    be better set to False."""
+    append_to_ner_link_candidates: bool = False
+    """If `use_ner_link_candidates` is enabled, generate additional
+    candidates and append them to existing NER candidates instead of only
+    generating for entities that have none. This will often result in a slight
+    increase in recall and precision."""
+    use_pre_inference: bool = True
+    """Whether to use the pre-inference step to filter candidates before
+    calculating similarities. This can speed up inference by only calculating
+    similarities for candidates that are likely to be correct based directly on word
+    matching."""
     learning_rate: float = 1e-4
     """Learning rate for training the embedding linker. Only used if 
     the embedding linker is trainable."""
     weight_decay: float = 0.01
     """Weight decay for training the embedding linker. Only used if
     the embedding linker is trainable."""
+    multiple_predictions_per_detected_entity: bool = False
+    """Whether to allow multiple predictions per detected entity. If False, only 
+    the highest scoring candidate will be returned for each entity. If True, all 
+    candidates that exceed the similarity thresholds will be returned. This can be 
+    useful if you want to return multiple CUIs for an entity, but can also lead to 
+    more false positives."""
+    pre_inference_top_k_sampling: int = 1
+    """When using pre-inference to filter candidates, how many names to then add
+    their related CUIs as potential candidates. Higher numbers will increase recall 
+    but also increase inference time, and reduce precision. This is influenced by 
+    `short_similarity_threshold`, i.e. pass the top k samples over the threshold 
+    for inference."""
+    inference_top_k_sampling: int = 1
+    """At the inference step, after calculating similarity scores, how many candidates 
+    to keep for each entity. Higher numbers will increase recall but also increase 
+    inference time, and often reduce precision. This is influenced by 
+    `long_similarity_threshold`, i.e. take the top k samples over the threshold. This 
+    will be ignored if `multiple_predictions_per_detected_entity` is set to False."""
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/trainable_embedding_linker.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/trainable_embedding_linker.py
@@ -2,6 +2,7 @@
 from medcat_embedding_linker.config import EmbeddingLinking
 from torch import Tensor
 from medcat.cdb import CDB
+from medcat.components.types import TrainableComponent
 from medcat.config.config import Config, ComponentConfig
 from medcat.components.linking.vector_context_model import PerDocumentTokenCache
 from medcat.tokenizing.tokenizers import BaseTokenizer
@@ -17,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 
-class Linker(StaticEmbeddingLinker, AbstractManualSerialisable):
+class Linker(StaticEmbeddingLinker, AbstractManualSerialisable, TrainableComponent):
     """Trainable variant of the embedding linker.
     This class inherits inference and embedding behavior from Linker and provides
     method hooks for online/offline training.
@@ -28,7 +29,10 @@ class Linker(StaticEmbeddingLinker, AbstractManualSerialisable):
     _MODEL_FOLDER_NAME = "trainable_embedding_model"
     _MODEL_STATE_FILE_NAME = "model_state.pt"
 
-    def __init__(self, cdb: CDB, config: Config) -> None:
+    def __init__(self, 
+                 cdb: CDB, 
+                 config: Config,
+                 tokenizer: BaseTokenizer) -> None:
         if not isinstance(config.components.linking, EmbeddingLinking):
             raise TypeError("Linking config must be an EmbeddingLinking instance")
         self.cnf_l: EmbeddingLinking = config.components.linking
@@ -41,6 +45,7 @@ def __init__(self, cdb: CDB, config: Config) -> None:
         super().__init__(
             cdb,
             config,
+            tokenizer,
             model_init_kwargs=model_init_kwargs,
         )
         self.training_batch: list[tuple] = []
@@ -407,7 +412,7 @@ def create_new_component(
         vocab: Vocab,
         model_load_path: Optional[str],
     ) -> "Linker":
-        return cls(cdb, cdb.config)
+        return cls(cdb, cdb.config, tokenizer)
 
     def serialise_to(self, folder_path: str) -> None:
         os.makedirs(folder_path, exist_ok=True)
@@ -424,7 +429,8 @@ def deserialise_from(
         cls, folder_path: str, **init_kwargs
     ) -> "Linker":
         cdb = init_kwargs["cdb"]
-        linker = cls(cdb, cdb.config)
+        tokenizer = init_kwargs["tokenizer"]
+        linker = cls(cdb, cdb.config, tokenizer)
 
         model_state_path = os.path.join(
             folder_path, cls._MODEL_FOLDER_NAME, cls._MODEL_STATE_FILE_NAME

diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/transformer_context_model.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/transformer_context_model.py
@@ -3,6 +3,7 @@
 from medcat.storage.serialisables import AbstractSerialisable
 from torch import Tensor, nn
 from transformers import AutoModel, AutoTokenizer
+from medcat_embedding_linker.config import EmbeddingLinking as LinkingConfig
 from tqdm import tqdm
 import json
 import logging
@@ -23,14 +24,16 @@ class ModelForEmbeddingLinking(nn.Module):
     def __init__(
         self,
         embedding_model_name: str,
+        cnf_l: LinkingConfig,
         use_projection_layer: bool = False,
-        top_n_layers_to_unfreeze: int = -1,
+        top_n_layers_to_unfreeze: int = 0,
         device: Optional[Union[str, torch.device]] = None,
     ) -> None:
         super().__init__()
         self.language_model = AutoModel.from_pretrained(embedding_model_name)
         self.base_model_name = self.language_model.name_or_path
 
+        self.cnf_l = cnf_l
         self.use_projection_layer = use_projection_layer
         self.top_n_layers_to_unfreeze = top_n_layers_to_unfreeze
 
@@ -86,6 +89,10 @@ def _freeze_all_parameters(self) -> None:
                 param.requires_grad = True
 
     def unfreeze_top_n_lm_layers(self, n: int) -> None:
+        self.cnf_l.top_n_layers_to_unfreeze = n
+        self.top_n_layers_to_unfreeze = n
+        # Re-apply from a known baseline so repeated calls are deterministic.
+        self._freeze_all_parameters()
         # train all LM layers - each layer requires more data
         if n == -1:
             for param in self.language_model.parameters():
@@ -133,6 +140,7 @@ def save_pretrained(self, save_directory: Union[str, Path]) -> None:
     def from_pretrained(
         cls,
         path_or_model_name: Union[str, Path],
+        cnf_l: LinkingConfig,
         device: Optional[Union[str, torch.device]] = None,
         **kwargs,
     ) -> "ModelForEmbeddingLinking":
@@ -147,7 +155,7 @@ def from_pretrained(
                 config = json.load(f)
 
             config.update(kwargs)
-            model = cls(**config)
+            model = cls(cnf_l=cnf_l, **config)
             state_dict = torch.load(weights_path, map_location="cpu")
             model.load_state_dict(state_dict)
             model.to(target_device)
@@ -156,6 +164,7 @@ def from_pretrained(
         # Hugging Face model id/path.
         model = cls(
             embedding_model_name=str(path_or_model_name),
+            cnf_l=cnf_l,
             device=target_device,
             **kwargs,
         )
@@ -208,8 +217,19 @@ def _resolve_model_source(path_or_model_name: Union[str, Path]) -> str:
         return str(path_or_model_name)
 
     def _get_model_init_kwargs(self) -> dict[str, Any]:
-        """Build kwargs passed to ModelForEmbeddingLinking.from_pretrained."""
-        return dict(self._model_init_kwargs)
+        """Build kwargs passed to ModelForEmbeddingLinking.from_pretrained.
+
+        Keep these in sync with runtime linker config so model swaps preserve
+        trainability settings (i.e. top-n LM layers to unfreeze).
+        """
+        kwargs = dict(self._model_init_kwargs)
+        if hasattr(self.cnf_l, "use_projection_layer"):
+            kwargs["use_projection_layer"] = self.cnf_l.use_projection_layer
+        if hasattr(self.cnf_l, "top_n_layers_to_unfreeze"):
+            kwargs["top_n_layers_to_unfreeze"] = (
+                self.cnf_l.top_n_layers_to_unfreeze
+            )
+        return kwargs
 
     def load_transformers(self, embedding_model_name: Union[str, Path]) -> None:
         """Load tokenizer/model from local path or Hugging Face model id."""
@@ -224,7 +244,9 @@ def load_transformers(self, embedding_model_name: Union[str, Path]) -> None:
             self.cnf_l.embedding_model_name = str(embedding_model_name)
             self.tokenizer = AutoTokenizer.from_pretrained(model_source)
             self.model = ModelForEmbeddingLinking.from_pretrained(
-                model_source, **model_init_kwargs
+                model_source,
+                cnf_l=self.cnf_l,
+                **model_init_kwargs,
             )
             self.model.eval()
             self.device = torch.device(

diff --git a/medcat-plugins/embedding-linker/tests/test_embedding_linker.py b/medcat-plugins/embedding-linker/tests/test_embedding_linker.py
@@ -67,7 +67,8 @@ class NonTrainableEmbeddingLinkerTests(unittest.TestCase):
     cnf = Config()
     cnf.components.linking = embedding_linker.EmbeddingLinking()
     cnf.components.linking.comp_name = embedding_linker.Linker.name
-    linker = embedding_linker.Linker(FakeCDB(cnf), cnf)
+    vtokenizer = FakeTokenizer()
+    linker = embedding_linker.Linker(FakeCDB(cnf), cnf, vtokenizer)
 
     def test_linker_is_not_trainable(self):
         self.assertNotIsInstance(self.linker, TrainableComponent)
@@ -83,7 +84,8 @@ class TrainableEmbeddingLinkerTests(unittest.TestCase):
     cnf.components.linking.comp_name = (
         trainable_embedding_linker.Linker.name
     )
-    linker = trainable_embedding_linker.Linker(FakeCDB(cnf), cnf)
+    vtokenizer = FakeTokenizer()
+    linker = trainable_embedding_linker.Linker(FakeCDB(cnf), cnf, vtokenizer)
 
     def test_linker_is_trainable(self):
         self.assertIsInstance(self.linker, TrainableComponent)

diff --git a/medcat-plugins/rawstring-tokenizer/README.md b/medcat-plugins/rawstring-tokenizer/README.md
@@ -0,0 +1,38 @@
+# MedCAT-gliner
+
+This provides a [GLiNER](https://github.com/urchade/GLiNER) based NER step for the MedCAT core library.
+
+# Usage
+
+First install from PyPI, e.g.:
+```
+pip install medcat-gliner
+```
+Subsequently, if you have an existing model, you should be able to just change the NER component:
+```
+cat = CAT.load_model_pack("path/to/existing/model")
+# change component
+from medcat_gliner import GLiNERConfig
+cat.config.components.ner.comp_name = "gliner_ner"
+cat.config.components.ner.custom_cnf = GLiNERConfig()
+# recreate pipe with new NER component
+cat._recreate_pipe()
+# use as needed
+```
+
+## NER recall comparison (linkable SNOMED entities)
+
+The following results compare the existing NER (vocab based NER with spell checking) implementation with the gliner implementation when used as the NER component within MedCAT.
+Evaluation was performed on the **2023 SNOMED CT Linking Challenge** dataset.
+
+> **Important caveat**
+> This is **not a measure of general NER quality**.
+> Recall is computed only with respect to annotated, linkable SNOMED CT entities present in the linking dataset.
+> Mentions outside the annotation scope are treated as false positives by construction, so precision is not meaningful here.
+
+| Implementation         | True Positives | False Negatives | Recall | Runtime |
+| ---------------------- | -------------- | --------------- | ------ | ------- |
+| Vocab based NER        | 10,545         | 3,917           | 0.729  | ~5m 50s |
+| GliNER implementation  | 7,971          | 6,491           | 0.551  | ~34m    |
+
+As we can see, for this dataset, GliNER is significantly slower and performs worse than the standard vocab based implementation. This is likely because the vocab based NER step has been configured and tuned to work best within the MedCAT pipeline. It is likely that with additional tuning the GliNER implementation could perform as well as or better than the vocab based NER does.
diff --git a/medcat-plugins/rawstring-tokenizer/pyproject.toml b/medcat-plugins/rawstring-tokenizer/pyproject.toml
@@ -0,0 +1,113 @@
+[project]
+name = "medcat-rawstring-tokenizer"
+
+dynamic = ["version"]
+
+description = "Rawstring tokenizer for MedCAT"
+
+readme = "README.md"
+
+requires-python = ">=3.10"
+
+license = {text = "Apache-2.0"}
+
+keywords = ["ML", "NLP", "NER+L"]
+
+authors = [
+  {name = "A. Sutton"},
+  {name = "M. Ratas"},
+]
+
+# This should be your name or the names of the organization who currently
+# maintains the project, and a valid email address corresponding to the name
+# listed.
+maintainers = [
+  {name = "CogStack", email = "contact@cogstack.org" }
+]
+
+classifiers = [
+  # How mature is this project? Common values are
+  #   3 - Alpha
+  #   4 - Beta
+  #   5 - Production/Stable
+  "Development Status :: 5 - Production/Stable",
+
+  "Intended Audience :: Healthcare Industry",
+  # "Topic :: Natural Language Processing :: Named Entity Recognition and Linking",
+
+  # Specify the Python versions you support here. In particular, ensure
+  # that you indicate you support Python 3. These classifiers are *not*
+  # checked by "pip install". See instead "requires-python" above.
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3 :: Only",
+  "Operating System :: OS Independent",
+]
+
+# This field lists other packages that your project depends on to run.
+# Any package you put here will be installed by pip when your project is
+# installed, so they must be valid existing projects.
+#
+# For an analysis of this field vs pip's requirements files see:
+# https://packaging.python.org/discussions/install-requires-vs-requirements/
+dependencies = [
+  "medcat[spacy]>=2.7",
+]
+
+# List additional groups of dependencies here (e.g. development
+# dependencies). Users will be able to install these using the "extras"
+# syntax, for example:
+#
+#   $ pip install sampleproject[dev]
+#
+# Similar to `dependencies` above, these must be valid existing
+# projects.
+[project.optional-dependencies] # Optional
+dev = [
+  "ruff~=0.1.7",
+  "mypy",
+  "types-tqdm",
+  "types-setuptools",
+  "types-PyYAML",
+]
+
+# entry-points to add onto medcat
+[project.entry-points."medcat.plugins"]
+medcat_rawstring_tokenizer = "medcat_rawstring_tokenizer"
+
+[project.urls]
+"Homepage" = "https://cogstack.org/"
+"Bug Reports" = "https://discourse.cogstack.org/"
+"Source" = "https://github.com/CogStack/cogstack-nlp/tree/main/medcat-plugins/rawstring-tokenizer"
+
+[build-system]
+# These are the assumed default build requirements from pip:
+# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
+requires = ["setuptools>=43.0.0", "setuptools_scm>=8", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+"medcat_rawstring_tokenizer" = ["py.typed"]
+
+[tool.setuptools_scm]
+# look for .git folder in root of repo
+root = "../.."
+version_scheme = "post-release"
+local_scheme = "no-local-version"
+tag_regex = "^medcat-rawstring-tokenizer/v(?P<version>\\d+(?:\\.\\d+)*)(?:[ab]\\d+|rc\\d+)?$"
+git_describe_command = "git describe --dirty --tags --long --match 'medcat-rawstring-tokenizer/v*'"
+
+[tool.ruff.lint]
+# 1. Enable some extra checks for ruff
+select = ["E", "F"]
+# ignore unused local variables
+ignore = ["F841"]
diff --git a/medcat-plugins/rawstring-tokenizer/src/medcat_rawstring_tokenizer/__init__.py b/medcat-plugins/rawstring-tokenizer/src/medcat_rawstring_tokenizer/__init__.py