Overview
Comment: | smaz-tools: add a procedure to help evaluate and improve dictionaries |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
3c8fd02a5c318a0f0c74a761488a7be1 |
User & Date: | nat on 2016-10-06 17:26:42 |
Other Links: | manifest | tags |
Context
2016-10-07
| ||
16:01 | tools/smaz: first draft of CLI interface for Evaluate_Dictionary check-in: a27f42e127 user: nat tags: trunk | |
2016-10-06
| ||
17:26 | smaz-tools: add a procedure to help evaluate and improve dictionaries check-in: 3c8fd02a5c user: nat tags: trunk | |
2016-10-05
| ||
14:49 | tools/smaz: add a command-line option for maximum word size check-in: 2995e1835c user: nat tags: trunk | |
Changes
Modified src/natools-smaz-tools.adb from [a336f10fe0] to [627ca11783].
︙ | |||
432 433 434 435 436 437 438 439 440 441 442 443 444 445 | 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + |
         if Next - First in Min_Size .. Max_Size then
            Add_Word (Counter, Phrase (First .. Next - 1));
         end if;
      end loop Main_Loop;
   end Add_Words;


   --  Compress every string of Corpus with Dict and report two results:
   --    * Compressed_Size: sum of the lengths of all compressed outputs;
   --    * Counts: for each byte value in Dict.Offsets'Range, the number of
   --      times that byte is met as a code while scanning the compressed
   --      outputs. Entries for other byte values are left at zero.
   --  Bytes outside Dict.Offsets'Range are handled as headers of verbatim
   --  runs, which are skipped without being counted.
   procedure Evaluate_Dictionary
     (Dict : in Dictionary;
      Corpus : in String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Dictionary_Counts)
   is
      --  Number of byte values strictly above Dict.Dict_Last.
      --  NOTE(review): presumably these are the codes reserved for verbatim
      --  headers; confirm against Compress.
      Verbatim_Code_Count : constant Ada.Streams.Stream_Element_Offset
        := Ada.Streams.Stream_Element_Offset
             (Ada.Streams.Stream_Element'Last - Dict.Dict_Last);

      --  Scratch variables reused for every scanned byte.
      Verbatim_Length : Ada.Streams.Stream_Element_Offset;
      Input_Byte : Ada.Streams.Stream_Element;
   begin
      --  Both out parameters are reset before accumulating.
      Compressed_Size := 0;

      for I in Counts'Range loop
         Counts (I) := 0;
      end loop;

      for S of Corpus loop
         declare
            use type Ada.Streams.Stream_Element_Offset;

            Compressed : constant Ada.Streams.Stream_Element_Array
              := Compress (Dict, S);
            Index : Ada.Streams.Stream_Element_Offset := Compressed'First;
         begin
            Compressed_Size := Compressed_Size + Compressed'Length;

            --  Walk the compressed stream one code at a time.
            while Index in Compressed'Range loop
               Input_Byte := Compressed (Index);

               if Input_Byte in Dict.Offsets'Range then
                  --  Dictionary reference: one more use of this entry.
                  Counts (Input_Byte) := Counts (Input_Byte) + 1;
                  Index := Index + 1;
               else
                  --  Any other byte: work out how many following bytes
                  --  belong to it so that they are not read as codes.
                  if not Dict.Variable_Length_Verbatim then
                     --  Length is carried by the header byte alone:
                     --  Stream_Element'Last stands for 1, and each lower
                     --  value for one more.
                     Verbatim_Length := Ada.Streams.Stream_Element_Offset
                       (Ada.Streams.Stream_Element'Last - Input_Byte) + 1;
                  elsif Input_Byte < Ada.Streams.Stream_Element'Last then
                     --  Variable-length mode, short form: the distance
                     --  from Stream_Element'Last is the length.
                     Verbatim_Length := Ada.Streams.Stream_Element_Offset
                       (Ada.Streams.Stream_Element'Last - Input_Byte);
                  else
                     --  Variable-length mode, Input_Byte equal to
                     --  Stream_Element'Last: the next byte extends the
                     --  length. Index must move onto that byte before it
                     --  is read, and stays there for the skip below.
                     --  NOTE(review): relies on Compress always emitting
                     --  that extra byte; Index is not re-checked against
                     --  Compressed'Range here.
                     Index := Index + 1;
                     Verbatim_Length := Ada.Streams.Stream_Element_Offset
                       (Compressed (Index)) + Verbatim_Code_Count - 1;
                  end if;

                  --  Skip the byte at Index plus the Verbatim_Length bytes
                  --  that follow it.
                  Index := Index + Verbatim_Length + 1;
               end if;
            end loop;
         end;
      end loop;
   end Evaluate_Dictionary;


   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural)
     return String_Lists.List
   is
      use type Ada.Containers.Count_Type; |
︙ |
Modified src/natools-smaz-tools.ads from [a92069b766] to [d2561626f6].
︙ | |||
100 101 102 103 104 105 106 107 108 109 110 111 112 113 | 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | + + + + + + + + + + + + | function Simple_Dictionary (Counter : in Word_Counter; Word_Count : in Natural) return String_Lists.List; -- Return the Word_Count words in Counter that have the highest score, -- the score being count * length. type Dictionary_Counts is array (Ada.Streams.Stream_Element) of String_Count; procedure Evaluate_Dictionary (Dict : in Dictionary; Corpus : in String_Lists.List; Compressed_Size : out Ada.Streams.Stream_Element_Count; Counts : out Dictionary_Counts); -- Compress all strings of Corpus, returning the total number of -- compressed bytes and the number of uses for each dictionary -- element. private package Word_Maps is new Ada.Containers.Indefinite_Ordered_Maps (String, String_Count); type Word_Counter is record |
︙ |