pmt: revert 7f8090b
This commit is contained in:
@@ -1,376 +0,0 @@
|
||||
#!/usr/bin/env -S clisp -C
|
||||
|
||||
;;; Creation of gnulib's uninames.h from the UnicodeData.txt and NameAliases.txt
|
||||
;;; tables.
|
||||
|
||||
;;; Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
;;; Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
|
||||
;;;
|
||||
;;; This program is free software.
|
||||
;;; It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
;;; You can redistribute it and/or modify it under either
|
||||
;;; - the terms of the GNU Lesser General Public License as published
|
||||
;;; by the Free Software Foundation, either version 3, or (at your
|
||||
;;; option) any later version, or
|
||||
;;; - the terms of the GNU General Public License as published by the
|
||||
;;; Free Software Foundation; either version 2, or (at your option)
|
||||
;;; any later version, or
|
||||
;;; - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
;;;
|
||||
;;; This program is distributed in the hope that it will be useful,
|
||||
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;;; Lesser General Public License and the GNU General Public License
|
||||
;;; for more details.
|
||||
;;;
|
||||
;;; You should have received a copy of the GNU Lesser General Public
|
||||
;;; License and of the GNU General Public License along with this
|
||||
;;; program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
(defparameter add-comments nil)
|
||||
|
||||
(defstruct unicode-char
|
||||
(index nil :type integer)
|
||||
(name nil :type string)
|
||||
word-indices
|
||||
word-indices-index
|
||||
)
|
||||
|
||||
(defstruct range
|
||||
(index nil :type integer)
|
||||
(start-code nil :type integer)
|
||||
(end-code nil :type integer)
|
||||
)
|
||||
|
||||
(defstruct word-list
|
||||
(hashed nil :type hash-table)
|
||||
(sorted nil :type list)
|
||||
size ; number of characters total
|
||||
length ; number of words
|
||||
)
|
||||
|
||||
(defun main (inputfile aliasfile outputfile)
|
||||
(declare (type string inputfile aliasfile outputfile))
|
||||
#+UNICODE (setq *default-file-encoding* charset:utf-8)
|
||||
(let ((all-chars '())
|
||||
(all-chars-hashed (make-hash-table :test #'equal))
|
||||
(all-aliases '())
|
||||
all-chars-and-aliases
|
||||
(all-ranges '())
|
||||
(name-index 0)
|
||||
range)
|
||||
;; Read all characters and names from the input file.
|
||||
(with-open-file (istream inputfile :direction :input)
|
||||
(loop
|
||||
(let ((line (read-line istream nil nil)))
|
||||
(unless line (return))
|
||||
(let* ((i1 (position #\; line))
|
||||
(i2 (position #\; line :start (1+ i1)))
|
||||
(code-string (subseq line 0 i1))
|
||||
(code (parse-integer code-string :radix 16))
|
||||
(name-string (subseq line (1+ i1) i2)))
|
||||
; Ignore characters whose name starts with "<".
|
||||
(unless (eql (char name-string 0) #\<)
|
||||
; Also ignore Hangul syllables; they are treated specially.
|
||||
(unless (<= #xAC00 code #xD7A3)
|
||||
; Also ignore CJK compatibility ideographs; they are treated
|
||||
; specially as well.
|
||||
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
|
||||
(<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
|
||||
;; Also ignore variationselectors; they are treated
|
||||
;; specially as well.
|
||||
(unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
|
||||
(push (make-unicode-char :index name-index
|
||||
:name name-string)
|
||||
all-chars
|
||||
)
|
||||
(setf (gethash code all-chars-hashed) (car all-chars))
|
||||
;; Update the contiguous range, or start a new range.
|
||||
(if (and range (= (1+ (range-end-code range)) code))
|
||||
(setf (range-end-code range) code)
|
||||
(progn
|
||||
(when range
|
||||
(push range all-ranges))
|
||||
(setq range (make-range :index name-index
|
||||
:start-code code
|
||||
:end-code code))))
|
||||
(incf name-index)
|
||||
) ) ) )
|
||||
) ) ) )
|
||||
(setq all-chars (nreverse all-chars))
|
||||
(if range
|
||||
(push range all-ranges))
|
||||
(setq all-ranges (nreverse all-ranges))
|
||||
(when aliasfile
|
||||
;; Read all characters and names from the alias file.
|
||||
(with-open-file (istream aliasfile :direction :input)
|
||||
(loop
|
||||
(let ((line (read-line istream nil nil)))
|
||||
(unless line (return))
|
||||
(unless (or (equal line "") (equal (subseq line 0 1) "#"))
|
||||
(let* ((i1 (position #\; line))
|
||||
(i2 (position #\; line :start (1+ i1)))
|
||||
(code-string (subseq line 0 i1))
|
||||
(code (parse-integer code-string :radix 16))
|
||||
(name-string (subseq line (1+ i1) i2))
|
||||
(uc (gethash code all-chars-hashed)))
|
||||
(when uc
|
||||
(push (make-unicode-char :index (unicode-char-index uc)
|
||||
:name name-string)
|
||||
all-aliases
|
||||
) ) ) ) ) ) ) )
|
||||
(setq all-aliases (nreverse all-aliases)
|
||||
all-chars-and-aliases (append all-chars all-aliases)
|
||||
)
|
||||
;; Split into words.
|
||||
(let ((words-by-length (make-array 0 :adjustable t)))
|
||||
(dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
|
||||
(mapcar #'unicode-char-name all-chars-and-aliases)))
|
||||
(let ((i1 0))
|
||||
(loop
|
||||
(when (>= i1 (length name)) (return))
|
||||
(let ((i2 (or (position #\Space name :start i1) (length name))))
|
||||
(let* ((word (subseq name i1 i2))
|
||||
(len (length word)))
|
||||
(when (>= len (length words-by-length))
|
||||
(adjust-array words-by-length (1+ len))
|
||||
)
|
||||
(unless (aref words-by-length len)
|
||||
(setf (aref words-by-length len)
|
||||
(make-word-list
|
||||
:hashed (make-hash-table :test #'equal)
|
||||
:sorted '()
|
||||
) ) )
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(unless (gethash word (word-list-hashed word-list))
|
||||
(setf (gethash word (word-list-hashed word-list)) t)
|
||||
(push word (word-list-sorted word-list))
|
||||
) )
|
||||
)
|
||||
(setq i1 (1+ i2))
|
||||
) ) ) )
|
||||
;; Sort the word lists.
|
||||
(dotimes (len (length words-by-length))
|
||||
(unless (aref words-by-length len)
|
||||
(setf (aref words-by-length len)
|
||||
(make-word-list
|
||||
:hashed (make-hash-table :test #'equal)
|
||||
:sorted '()
|
||||
) ) )
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(setf (word-list-sorted word-list)
|
||||
(sort (word-list-sorted word-list) #'string<)
|
||||
)
|
||||
(setf (word-list-size word-list)
|
||||
(reduce #'+ (mapcar #'length (word-list-sorted word-list)))
|
||||
)
|
||||
(setf (word-list-length word-list)
|
||||
(length (word-list-sorted word-list))
|
||||
) ) )
|
||||
;; Output the tables.
|
||||
(with-open-file (ostream outputfile :direction :output
|
||||
#+UNICODE :external-format #+UNICODE charset:ascii)
|
||||
(format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
|
||||
(format ostream "/*~%")
|
||||
(format ostream " * ~A~%" (file-namestring outputfile))
|
||||
(format ostream " *~%")
|
||||
(format ostream " * Unicode character name table.~%")
|
||||
(format ostream " * Generated automatically by the gen-uninames utility.~%")
|
||||
(format ostream " */~%")
|
||||
(format ostream "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.~%")
|
||||
(format ostream "~%")
|
||||
(format ostream " This file is free software.~%")
|
||||
(format ostream " It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".~%")
|
||||
(format ostream " You can redistribute it and/or modify it under either~%")
|
||||
(format ostream " - the terms of the GNU Lesser General Public License as published~%")
|
||||
(format ostream " by the Free Software Foundation, either version 3, or (at your~%")
|
||||
(format ostream " option) any later version, or~%")
|
||||
(format ostream " - the terms of the GNU General Public License as published by the~%")
|
||||
(format ostream " Free Software Foundation; either version 2, or (at your option)~%")
|
||||
(format ostream " any later version, or~%")
|
||||
(format ostream " - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".~%")
|
||||
(format ostream "~%")
|
||||
(format ostream " This file is distributed in the hope that it will be useful,~%")
|
||||
(format ostream " but WITHOUT ANY WARRANTY; without even the implied warranty of~%")
|
||||
(format ostream " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU~%")
|
||||
(format ostream " Lesser General Public License and the GNU General Public License~%")
|
||||
(format ostream " for more details.~%")
|
||||
(format ostream "~%")
|
||||
(format ostream " You should have received a copy of the GNU Lesser General Public~%")
|
||||
(format ostream " License and of the GNU General Public License along with this~%")
|
||||
(format ostream " program. If not, see <https://www.gnu.org/licenses/>. */~%")
|
||||
(format ostream "~%")
|
||||
(format ostream "static const char unicode_name_words[~D] = {~%"
|
||||
(let ((sum 0))
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(incf sum (word-list-size word-list))
|
||||
) )
|
||||
sum
|
||||
) )
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(dolist (word (word-list-sorted word-list))
|
||||
(format ostream " ~{ '~C',~}~%" (coerce word 'list))
|
||||
) ) )
|
||||
(format ostream "};~%")
|
||||
(format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
|
||||
(let ((sum 0))
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(incf sum (word-list-length word-list))
|
||||
) )
|
||||
sum
|
||||
) )
|
||||
#| ; Redundant data
|
||||
(format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
|
||||
(let ((sum 0))
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(incf sum (word-list-length word-list))
|
||||
) )
|
||||
sum
|
||||
) )
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(when (word-list-sorted word-list)
|
||||
(format ostream " ")
|
||||
(do ((l (word-list-sorted word-list) (cdr l))
|
||||
(offset 0 (+ offset (length (car l)))))
|
||||
((endp l))
|
||||
(format ostream "~<~% ~0,79:; ~D,~>" offset)
|
||||
)
|
||||
(format ostream "~%")
|
||||
) ) )
|
||||
(format ostream "};~%")
|
||||
|#
|
||||
(format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
|
||||
(1+ (length words-by-length))
|
||||
)
|
||||
(let ((extra-offset 0)
|
||||
(ind-offset 0))
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(format ostream " { ~D, ~D },~%" extra-offset ind-offset)
|
||||
(incf extra-offset (word-list-size word-list))
|
||||
(incf ind-offset (word-list-length word-list))
|
||||
) )
|
||||
(format ostream " { ~D, ~D }~%" extra-offset ind-offset)
|
||||
)
|
||||
(format ostream "};~%")
|
||||
(let ((ind-offset 0))
|
||||
(dotimes (len (length words-by-length))
|
||||
(let ((word-list (aref words-by-length len)))
|
||||
(dolist (word (word-list-sorted word-list))
|
||||
(setf (gethash word (word-list-hashed word-list)) ind-offset)
|
||||
(incf ind-offset)
|
||||
) ) ) )
|
||||
(dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
|
||||
(format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
|
||||
(gethash word (word-list-hashed (aref words-by-length (length word))))
|
||||
) )
|
||||
;; Compute the word-indices for every unicode-char.
|
||||
(dolist (uc all-chars-and-aliases)
|
||||
(let ((name (unicode-char-name uc))
|
||||
(indices '()))
|
||||
(let ((i1 0))
|
||||
(loop
|
||||
(when (>= i1 (length name)) (return))
|
||||
(let ((i2 (or (position #\Space name :start i1) (length name))))
|
||||
(let* ((word (subseq name i1 i2))
|
||||
(len (length word)))
|
||||
(push (gethash word (word-list-hashed (aref words-by-length len)))
|
||||
indices
|
||||
)
|
||||
)
|
||||
(setq i1 (1+ i2))
|
||||
) ) )
|
||||
(setf (unicode-char-word-indices uc)
|
||||
(coerce (nreverse indices) 'vector)
|
||||
)
|
||||
) )
|
||||
;; Sort the list of unicode-chars by word-indices.
|
||||
(setq all-chars-and-aliases
|
||||
(sort all-chars-and-aliases
|
||||
(lambda (vec1 vec2)
|
||||
(let ((len1 (length vec1))
|
||||
(len2 (length vec2)))
|
||||
(do ((i 0 (1+ i)))
|
||||
(nil)
|
||||
(if (< i len2)
|
||||
(if (< i len1)
|
||||
(cond ((< (aref vec1 i) (aref vec2 i)) (return t))
|
||||
((> (aref vec1 i) (aref vec2 i)) (return nil))
|
||||
)
|
||||
(return t)
|
||||
)
|
||||
(return nil)
|
||||
) ) ) )
|
||||
:key #'unicode-char-word-indices
|
||||
) )
|
||||
;; Output the word-indices.
|
||||
(format ostream "static const uint16_t unicode_names[~D] = {~%"
|
||||
(reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
|
||||
)
|
||||
(let ((i 0))
|
||||
(dolist (uc all-chars-and-aliases)
|
||||
(format ostream " ~{ ~D,~}"
|
||||
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
|
||||
(coerce (unicode-char-word-indices uc) 'list)
|
||||
)
|
||||
)
|
||||
(when add-comments
|
||||
(format ostream "~40T/* ~A */" (unicode-char-name uc))
|
||||
)
|
||||
(format ostream "~%")
|
||||
(setf (unicode-char-word-indices-index uc) i)
|
||||
(incf i (length (unicode-char-word-indices uc)))
|
||||
) )
|
||||
(format ostream "};~%")
|
||||
(format ostream "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_name_to_index[~D] = {~%"
|
||||
(length all-chars-and-aliases)
|
||||
)
|
||||
(dolist (uc all-chars-and-aliases)
|
||||
(format ostream " { 0x~4,'0X, ~D },"
|
||||
(unicode-char-index uc)
|
||||
(unicode-char-word-indices-index uc)
|
||||
)
|
||||
(when add-comments
|
||||
(format ostream "~21T/* ~A */" (unicode-char-name uc))
|
||||
)
|
||||
(format ostream "~%")
|
||||
)
|
||||
(format ostream "};~%")
|
||||
(format ostream "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_index_to_name[~D] = {~%"
|
||||
(length all-chars)
|
||||
)
|
||||
(dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
|
||||
(format ostream " { 0x~4,'0X, ~D },"
|
||||
(unicode-char-index uc)
|
||||
(unicode-char-word-indices-index uc)
|
||||
)
|
||||
(when add-comments
|
||||
(format ostream "~21T/* ~A */" (unicode-char-name uc))
|
||||
)
|
||||
(format ostream "~%")
|
||||
)
|
||||
(format ostream "};~%")
|
||||
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
|
||||
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
|
||||
)
|
||||
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
|
||||
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
|
||||
)
|
||||
(format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
|
||||
(length all-ranges))
|
||||
(dolist (range all-ranges)
|
||||
(format ostream " { ~D, ~D, ~D },~%"
|
||||
(range-index range)
|
||||
(- (range-start-code range) (range-index range))
|
||||
(1+ (- (range-end-code range) (range-start-code range))))
|
||||
)
|
||||
(format ostream "};~%")
|
||||
)
|
||||
) ) )
|
||||
|
||||
(main (first *args*) (second *args*) (third *args*))
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user