guix_mirror_bot pushed a commit to branch master
in repository guix.

commit 9f6f7fa0cb4e1036f10054c51e4803acf5220b8c
Author: Nguyễn Gia Phong <[email protected]>
AuthorDate: Mon Jan 12 13:21:22 2026 +0900

    gnu: Add python-sacremoses.
    
    * gnu/packages/language.scm (python-sacremoses): New variable.
    
    Change-Id: I7fd661f312c0cda107bf7eb79b99ea3f1eba3386
---
 gnu/packages/language.scm | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/gnu/packages/language.scm b/gnu/packages/language.scm
index d448d2ffc8..21e0aaa770 100644
--- a/gnu/packages/language.scm
+++ b/gnu/packages/language.scm
@@ -13,6 +13,7 @@
 ;;; Copyright © 2024 Nicolas Graves <[email protected]>
 ;;; Copyright © 2024, 2025 Zheng Junjie <[email protected]>
 ;;; Copyright © 2025 Janneke Nieuwenhuizen <[email protected]>
+;;; Copyright © 2026 Nguyễn Gia Phong <[email protected]>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -57,6 +58,8 @@
   #:use-module (gnu packages pkg-config)
   #:use-module (gnu packages pulseaudio)
   #:use-module (gnu packages python)
+  #:use-module (gnu packages python-build)
+  #:use-module (gnu packages python-xyz)
   #:use-module (gnu packages perl-check)
   #:use-module (gnu packages qt)
   #:use-module (gnu packages ruby)
@@ -77,6 +80,7 @@
   #:use-module (guix build-system glib-or-gtk)
   #:use-module (guix build-system gnu)
   #:use-module (guix build-system perl)
+  #:use-module (guix build-system pyproject)
   #:use-module (guix build-system qt)
   #:use-module ((guix licenses) #:prefix license:)
   #:use-module (guix download)
@@ -1059,6 +1063,50 @@ from the database are used as entries (heading terms).")
     ;; triple-licensed (at the user’s choice)
     (license (list license:gpl2+ license:lgpl2.1 license:bsd-3))))
 
+(define-public python-sacremoses
+  (package
+    (name "python-sacremoses")
+    (version "0.1.0")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+              (url "https://github.com/hplt-project/sacremoses")
+              (commit version)))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "0g70vchfniknp65n4wnx7chg6g49d4xrz1wagv7f7ir2swdzyn9b"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (let ((norvig-big-txt
+            (origin
+              (method url-fetch)
+              ;; The file is a concatenation of public domain book excerpts
+              ;; from Project Gutenberg and lists of most frequent words
+              ;; from Wiktionary and the British National Corpus:
+              ;; https://norvig.com/spell-correct.html
+              (uri "https://norvig.com/big.txt")
+              (sha256
+               (base32
+                "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps")))))
+       (list #:phases #~(modify-phases %standard-phases
+                          (add-before 'check 'supply-big-txt
+                            (lambda _
+                              (symlink #$norvig-big-txt "big.txt"))))
+             #:test-backend #~'unittest
+             #:test-flags #~'("discover" "-s" "sacremoses/test"))))
+    (native-inputs (list python-setuptools))
+    (propagated-inputs (list python-click
+                             python-joblib
+                             python-regex
+                             python-tqdm))
+    (home-page "https://github.com/hplt-project/sacremoses")
+    (synopsis "Natural language tokenizer, truecaser and normalizer")
+    (description
+     "SacreMoses is a Python port of Moses'
+tokenizer, detokenizer, truecaser and punctuation normalizer.")
+    (license license:expat)))
+
 (define-public dparser
   (package
     (name "dparser")

Reply via email to