LocalDoc/pii_ner_azerbaijani_extended
Viewer • Updated • 531k • 69
How to use LocalDoc/pii-ner-azerbaijani-v3 with Transformers:
# Option 1: let a high-level pipeline own tokenization and inference.
from transformers import pipeline

pipe = pipeline(
    task="token-classification",
    model="LocalDoc/pii-ner-azerbaijani-v3",
)

# Option 2: load the tokenizer and model directly.
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LocalDoc/pii-ner-azerbaijani-v3")
model = AutoModelForTokenClassification.from_pretrained("LocalDoc/pii-ner-azerbaijani-v3")

A high-accuracy Named Entity Recognition model for detecting Personally Identifiable Information (PII) in Azerbaijani text. Built on LocalDoc/mmBERT-small-en-az (ModernBERT architecture), this model is 4x smaller and faster than XLM-RoBERTa while achieving higher accuracy.
Recognizes both native and transliterated spellings (e.g., Şərifova and Sherifova).

| Metric | Value |
|---|---|
| Base Model | LocalDoc/mmBERT-small-en-az |
| Architecture | ModernBERT (22 layers, hidden=384) |
| Parameters | 69M |
| Model Size (fp32) | 0.26 GB |
| Max Sequence Length | 8,192 tokens |
| Training Data | LocalDoc/pii_ner_azerbaijani_extended (530K rows) |
| Training Epochs | 5 (best at epoch 5) |
| License | MIT |
| Metric | This Model (69M) | XLM-RoBERTa v2 (278M) |
|---|---|---|
| F1 | 0.9974 | 0.9746 |
| Precision | 0.9967 | 0.9760 |
| Recall | 0.9982 | 0.9732 |
| False Positives (hard neg) | 1 | 4 |
| Entity | F1 | Entity | F1 |
|---|---|---|---|
| GIVENNAME | 0.9974 | PASSPORTNUM | 0.9996 |
| SURNAME | 0.9980 | TAXNUM | 0.9994 |
| EMAIL | 0.9978 | TELEPHONENUM | 0.9993 |
| DATE | 0.9936 | TIME | 0.9993 |
| AGE | 0.9965 | CREDITCARDNUMBER | 0.9948 |
| CITY | 0.9967 | STREET | 0.9926 |
| IDCARDNUM | 0.9985 | BUILDINGNUM | 0.9976 |
| ZIPCODE | 0.9978 | | |
| Epoch | Loss | F1 | Precision | Recall |
|---|---|---|---|---|
| 1 | 0.0159 | 0.9839 | 0.9794 | 0.9889 |
| 2 | 0.0099 | 0.9877 | 0.9848 | 0.9908 |
| 3 | 0.0053 | 0.9949 | 0.9931 | 0.9967 |
| 4 | 0.0038 | 0.9972 | 0.9964 | 0.9980 |
| 5 | 0.0041 | 0.9974 | 0.9967 | 0.9982 |
GIVENNAME — First name (e.g., "Əli", "Aysel")
SURNAME — Last name (e.g., "Həsənov", "Məmmədova")
EMAIL — Email address
TELEPHONENUM — Phone number
DATE — Date in various formats
TIME — Time
AGE — Age
IDCARDNUM — ID card / FIN number
PASSPORTNUM — Passport number
TAXNUM — Tax identification number
CREDITCARDNUMBER — Credit card number
CITY — City name (as address, not adjective)
STREET — Street name
BUILDINGNUM — Building number
ZIPCODE — ZIP/postal code
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
class AzerbaijaniPiiNer:
    """Detect, anonymize and highlight PII in text with a token-classification model.

    The model receives a lowercased copy of the input, while every reported
    span (``start``/``end``) and ``value`` refers to the caller's original,
    un-lowercased string.
    """

    def __init__(self, model_name="LocalDoc/pii-ner-azerbaijani-v3", max_length: int = 512):
        """Load the tokenizer and model and move the model to GPU when available.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
            max_length: Token budget passed to the tokenizer. Longer inputs
                are truncated, so entities beyond the budget are not
                reported (and therefore not anonymized).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()
        self.id2label = self.model.config.id2label
        self.max_length = max_length

    @staticmethod
    def _lowercase_with_alignment(text: str) -> tuple[str, list[int]]:
        """Lowercase ``text`` and map each lowercase index back to ``text``.

        Returns:
            ``(lowered, index_map)`` where ``index_map[i]`` is the index in
            ``text`` of the character that produced ``lowered[i]``. The map
            carries one extra trailing entry equal to ``len(text)``.
        """
        lowered = text.lower()
        index_map: list[int] = []
        for position, char in enumerate(text):
            # Lowercasing can lengthen the string: "İ" (U+0130) becomes two
            # code points, so offsets in ``lowered`` drift from ``text``.
            index_map.extend([position] * len(char.lower()))
        if len(index_map) != len(lowered):
            # Keep the string and the map consistent if whole-string
            # lowercasing ever disagrees with the per-character lengths.
            lowered = "".join(char.lower() for char in text)
        index_map.append(len(text))
        return lowered, index_map

    @staticmethod
    def _decode_entities(labels, offsets, special_flags) -> list[dict]:
        """Merge per-token B-/I- labels into ``{"label", "start", "end"}`` spans.

        Args:
            labels: One label string per token (e.g. ``"B-CITY"``, ``"O"``).
            offsets: One ``(start, end)`` character pair per token.
            special_flags: Truthy for special tokens, which always close the
                span in progress and never contribute to one.
        """
        spans: list[dict] = []
        current = None
        for label, (token_start, token_end), is_special in zip(labels, offsets, special_flags):
            continues_current = (
                not is_special
                and current is not None
                and label.startswith("I-")
                and label[2:] == current["label"]
            )
            if continues_current:
                current["end"] = token_end
                continue
            # Anything else ends the span in progress.
            if current is not None:
                spans.append(current)
                current = None
            # Only a B- tag opens a span; a stray I- tag is ignored.
            if not is_special and label.startswith("B-"):
                current = {"label": label[2:], "start": token_start, "end": token_end}
        if current is not None:
            spans.append(current)
        return spans

    @staticmethod
    def _finalize_spans(text: str, index_map: list[int], spans) -> list[dict]:
        """Translate lowercase-text spans to ``text`` and attach trimmed values.

        Surrounding whitespace captured by a token offset is trimmed from the
        span, and spans that end up empty are dropped.
        """
        last = len(index_map) - 1
        entities: list[dict] = []
        for span in spans:
            low = min(max(span["start"], 0), last)
            high = min(max(span["end"], low), last)
            start = index_map[low]
            # Map the last covered character, then step past it, so a span
            # ending inside an expanded character still covers the original.
            end = index_map[high - 1] + 1 if high > low else start
            raw = text[start:end]
            value = raw.strip()
            if not value:
                continue
            start += len(raw) - len(raw.lstrip())
            entities.append(
                {
                    "label": span["label"],
                    "start": start,
                    "end": start + len(value),
                    "value": value,
                }
            )
        return entities

    @staticmethod
    def _replace_spans(text: str, entities, render) -> str:
        """Replace each entity span in ``text`` with ``render(entity)``."""
        result = text
        # Work right-to-left so earlier offsets stay valid after each splice.
        for ent in sorted(entities, key=lambda item: item["start"], reverse=True):
            result = result[:ent["start"]] + render(ent) + result[ent["end"]:]
        return result

    def predict(self, text: str) -> list[dict]:
        """Detect PII entities in ``text``.

        The input is lowercased for the model, but offsets and values are
        reported against the original text with its original casing.

        Returns:
            A list of dicts with keys ``label``, ``start``, ``end`` and
            ``value``, in order of appearance.
        """
        lowered, index_map = self._lowercase_with_alignment(text)
        encoded = self.tokenizer(
            lowered,
            return_tensors="pt",
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            truncation=True,
            max_length=self.max_length,
        )
        # These two are bookkeeping for decoding, not model inputs.
        offsets = encoded.pop("offset_mapping")[0].tolist()
        special_flags = encoded.pop("special_tokens_mask")[0].tolist()
        model_inputs = {key: value.to(self.device) for key, value in encoded.items()}
        with torch.no_grad():
            logits = self.model(**model_inputs).logits
        label_ids = torch.argmax(logits, dim=-1)[0].tolist()
        labels = [self.id2label[label_id] for label_id in label_ids]
        spans = self._decode_entities(labels, offsets, special_flags)
        return self._finalize_spans(text, index_map, spans)

    def anonymize(self, text: str, replacement: str = "***") -> str:
        """Replace all PII entities with a placeholder."""
        return self._replace_spans(text, self.predict(text), lambda ent: replacement)

    def highlight(self, text: str) -> str:
        """Return text with entities marked: [LABEL: value]."""
        return self._replace_spans(
            text,
            self.predict(text),
            lambda ent: f"[{ent['label']}: {ent['value']}]",
        )
# --- Example ---
if __name__ == "__main__":
    ner = AzerbaijaniPiiNer()

    # One sample per scenario the detector is meant to cope with.
    examples = (
        # Original Azerbaijani
        "Hörmətli Əhməd Süleymanlı, 05.03.1987 tarixli müraciətiniz qəbul edildi. Əlaqə: 055-234-67-89.",
        # Transliterated (informal)
        "Hormetli Ehmed Suleymanlı, 05.03.1987 tarixli muracietiniz qebul edildi. Elaqe: 055-234-67-89.",
        # Mixed context with hard negatives
        "Bakı küləyi güclüdür, amma Əli Bakıda Nizami küçəsi 42-də yaşayır.",
        # Complex document
        "Müştəri: Gülarə Məmmədli, 67 yaş. Pasport: AZE 1234567. Email: gulare@mail.az. Tel: 012-456-78-90.",
        # English-Azerbaijani mix
        "Dear customer Əli Həsənli, your order shipped to Bakı, 28 May küçəsi 12. Contact: ali@company.com.",
    )

    for sample in examples:
        # Show the input, both rewritten forms, then each entity on its own line.
        print(f"\nInput: {sample}")
        print(f"Highlight: {ner.highlight(sample)}")
        print(f"Anonymize: {ner.anonymize(sample)}")
        for found in ner.predict(sample):
            print(f" {found['label']:20s} → \"{found['value']}\" ({found['start']}:{found['end']})")
Input: Hörmətli Əhməd Süleymanlı, 05.03.1987 tarixli müraciətiniz qəbul edildi. Əlaqə: 055-234-67-89.
Highlight: Hörmətli [GIVENNAME: Əhməd] [SURNAME: Süleymanlı], [DATE: 05.03.1987] tarixli müraciətiniz qəbul edildi. Əlaqə: [TELEPHONENUM: 055-234-67-89].
Anonymize: Hörmətli *** ***, *** tarixli müraciətiniz qəbul edildi. Əlaqə: ***.
GIVENNAME → "Əhməd" (9:14)
SURNAME → "Süleymanlı" (15:25)
DATE → "05.03.1987" (27:37)
TELEPHONENUM → "055-234-67-89" (82:95)
from transformers import pipeline
# Build a token-classification pipeline whose results are grouped entities.
ner_pipeline = pipeline(
    task="token-classification",
    model="LocalDoc/pii-ner-azerbaijani-v3",
    aggregation_strategy="simple",
)

# Important: lowercase the input
text = "Əhməd Həsənov Bakıda yaşayır, telefonu 055-123-45-67."
results = ner_pipeline(text.lower())

# Print one line per grouped entity: label, matched text, confidence.
for entity in results:
    group = entity["entity_group"]
    word = entity["word"]
    score = entity["score"]
    print(f"{group:20s} → \"{word}\" (score: {score:.4f})")
Trained on LocalDoc/pii_ner_azerbaijani_extended (530K rows):
Without hard negatives, the model marks every city name as PII:
With hard negatives, the model learns context:
- Apply `.lower()` to the input before inference.
- Hyphenated numbers such as 055-987-65-43 may split. This is a known tokenizer limitation.

| | v3 (this) | v2 (XLM-RoBERTa) | v1 (XLM-RoBERTa) |
|---|---|---|---|
| Base | mmBERT-small | XLM-RoBERTa | XLM-RoBERTa |
| Parameters | 69M | 278M | 278M |
| F1 | 0.9974 | 0.9746 | 0.9629 |
| Hard neg FP | 1 | 4 | not tested |
| Transliteration | yes | no | no |
| Speed | 3-4x faster | 1x | 1x |
@misc{pii-ner-azerbaijani-v3,
title={PII NER Azerbaijani v3},
author={LocalDoc},
year={2025},
publisher={Hugging Face},
url={https://huggingface.co/LocalDoc/pii-ner-azerbaijani-v3}
}
The Creative Commons Attribution 4.0 International (CC BY 4.0) license allows:
For more information, refer to the CC BY 4.0 license.
For questions or issues, contact LocalDoc at [v.resad.89@gmail.com].