Overview
Comment: | tools/smaz: implement forced words for optimized dictionary generation |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | 014ca1d01b263357f3c6bbca35dcb701 |
User & Date: | nat on 2017-05-18 21:13:36 |
Other Links: | manifest | tags |
Context
2017-05-19
| ||
21:04 | tools/smaz: add command-line options for min and max dictionary size check-in: ecae1e85f4 user: nat tags: trunk | |
2017-05-18
| ||
21:13 | tools/smaz: implement forced words for optimized dictionary generation check-in: 014ca1d01b user: nat tags: trunk | |
2017-05-17
| ||
21:44 | tools/smaz: use the new version of Worst_Index check-in: 6eaac2a01c user: nat tags: trunk | |
Changes
Modified tools/smaz.adb from [e22ccc7ac2] to [4b4875a3ad].
︙ | ︙ | |||
365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 | return Word_Counter; -- Make a word counter from an input word list procedure Optimization_Round (Dict : in out Holders.Holder; Score : in out Ada.Streams.Stream_Element_Count; Counts : in out Dictionary_Counts; Pending_Words : in out String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods; Updated : out Boolean); -- Try to improve on Dict by replacing a single entry from it with -- one of the substring in Pending_Words. function Optimize_Dictionary (Base : in Dictionary; Pending_Words : in String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods) return Dictionary; -- Optimize the dictionary on Input_Texts, starting with Base and | > > | > | 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 | return Word_Counter; -- Make a word counter from an input word list procedure Optimization_Round (Dict : in out Holders.Holder; Score : in out Ada.Streams.Stream_Element_Count; Counts : in out Dictionary_Counts; First : in Dictionary_Entry; Pending_Words : in out String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods; Updated : out Boolean); -- Try to improve on Dict by replacing a single entry from it with -- one of the substring in Pending_Words. function Optimize_Dictionary (Base : in Dictionary; First : in Dictionary_Entry; Pending_Words : in String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods) return Dictionary; -- Optimize the dictionary on Input_Texts, starting with Base and -- adding substrings from Pending_Words. Operates only on words -- at First and beyond. 
procedure Parallel_Evaluate_Dictionary (Job_Count : in Positive; Dict : in Dictionary; Corpus : in String_Lists.List; Compressed_Size : out Ada.Streams.Stream_Element_Count; Counts : out Dictionary_Counts); |
︙ | ︙ | |||
552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 | end Make_Word_Counter; procedure Optimization_Round (Dict : in out Holders.Holder; Score : in out Ada.Streams.Stream_Element_Count; Counts : in out Dictionary_Counts; Pending_Words : in out String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods; Updated : out Boolean) is use type Ada.Streams.Stream_Element_Offset; New_Value : Ada.Strings.Unbounded.Unbounded_String; New_Position : String_Lists.Cursor; Worst_Index : constant Dictionary_Entry := Worst_Element | > | < | 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | end Make_Word_Counter; procedure Optimization_Round (Dict : in out Holders.Holder; Score : in out Ada.Streams.Stream_Element_Count; Counts : in out Dictionary_Counts; First : in Dictionary_Entry; Pending_Words : in out String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods; Updated : out Boolean) is use type Ada.Streams.Stream_Element_Offset; New_Value : Ada.Strings.Unbounded.Unbounded_String; New_Position : String_Lists.Cursor; Worst_Index : constant Dictionary_Entry := Worst_Element (Dict.Element, Counts, Method, First, Last_Code (Dict.Element)); Worst_Value : constant String := Dict_Entry (Dict.Element, Worst_Index); Worst_Count : constant String_Count := Counts (Worst_Index); Base : constant Dictionary := Remove_Element (Dict.Element, Worst_Index); Old_Score : constant Ada.Streams.Stream_Element_Count := Score; begin |
︙ | ︙ | |||
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 | & ')'); end if; end Optimization_Round; function Optimize_Dictionary (Base : in Dictionary; Pending_Words : in String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods) return Dictionary is Holder : Holders.Holder := Holders.To_Holder (Base); Pending : String_Lists.List := Pending_Words; Score : Ada.Streams.Stream_Element_Count; Counts : Dictionary_Counts; Running : Boolean := True; begin Evaluate_Dictionary (Job_Count, Base, Input_Texts, Score, Counts); while Running loop Optimization_Round (Holder, Score, Counts, Pending, Input_Texts, Job_Count, Method, Running); end loop; | > > | 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | & ')'); end if; end Optimization_Round; function Optimize_Dictionary (Base : in Dictionary; First : in Dictionary_Entry; Pending_Words : in String_Lists.List; Input_Texts : in String_Lists.List; Job_Count : in Natural; Method : in Methods) return Dictionary is Holder : Holders.Holder := Holders.To_Holder (Base); Pending : String_Lists.List := Pending_Words; Score : Ada.Streams.Stream_Element_Count; Counts : Dictionary_Counts; Running : Boolean := True; begin Evaluate_Dictionary (Job_Count, Base, Input_Texts, Score, Counts); while Running loop Optimization_Round (Holder, Score, Counts, First, Pending, Input_Texts, Job_Count, Method, Running); end loop; |
︙ | ︙ | |||
1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 | (Handler, To_Dictionary (Input, Handler.Vlen_Verbatim), Data_List, Method); when Dict_Sources.Text_List => declare Selected, Pending : String_Lists.List; begin Simple_Dictionary_And_Pending (Make_Word_Counter (Handler, Input), | > > > > > > > > > > > > > > < > > > > > > > | 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 | (Handler, To_Dictionary (Input, Handler.Vlen_Verbatim), Data_List, Method); when Dict_Sources.Text_List => declare Needed : constant Integer := Handler.Dict_Size - Natural (Handler.Forced_Words.Length); Selected, Pending : String_Lists.List; First : Dictionary_Entry := Dictionary_Entry'First; begin if Needed <= 0 then for Word of reverse Handler.Forced_Words loop Selected.Prepend (Word); if Positive (Selected.Length) = Handler.Dict_Size then return To_Dictionary (Selected, Handler.Vlen_Verbatim); end if; end loop; end if; Simple_Dictionary_And_Pending (Make_Word_Counter (Handler, Input), Needed, Selected, Pending, Method, Handler.Max_Pending); for Word of reverse Handler.Forced_Words loop Selected.Prepend (Word); First := Dictionary_Entry'Succ (First); end loop; return Optimize_Dictionary (To_Dictionary (Selected, Handler.Vlen_Verbatim), First, Pending, Input, Handler.Job_Count, Method); end; when Dict_Sources.Unoptimized_Text_List => |
︙ | ︙ |