-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathharry.man
1096 lines (1096 loc) · 44.9 KB
/
harry.man
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
.\" Automatically generated by Pod::Man 2.27 (Pod::Simple 3.28)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings. \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote. \*(C+ will
.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
. ds -- \(*W-
. ds PI pi
. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
. ds L" ""
. ds R" ""
. ds C` ""
. ds C' ""
'br\}
.el\{\
. ds -- \|\(em\|
. ds PI \(*p
. ds L" ``
. ds R" ''
. ds C`
. ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{
. if \nF \{
. de IX
. tm Index:\\$1\t\\n%\t"\\$2"
..
. if !\nF==2 \{
. nr % 0
. nr F 2
. \}
. \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
. \" fudge factors for nroff and troff
.if n \{\
. ds #H 0
. ds #V .8m
. ds #F .3m
. ds #[ \f1
. ds #] \fP
.\}
.if t \{\
. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
. ds #V .6m
. ds #F 0
. ds #[ \&
. ds #] \&
.\}
. \" simple accents for nroff and troff
.if n \{\
. ds ' \&
. ds ` \&
. ds ^ \&
. ds , \&
. ds ~ ~
. ds /
.\}
.if t \{\
. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
. \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
. \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
. \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
. ds : e
. ds 8 ss
. ds o a
. ds d- d\h'-1'\(ga
. ds D- D\h'-1'\(hy
. ds th \o'bp'
. ds Th \o'LP'
. ds ae ae
. ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "HARRY 1"
.TH HARRY 1 "2016-12-23" "Harry 0.4.3" "User Manual"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
harry \- A tool for measuring string similarity
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
\&\fBharry\fR [\fBoptions\fR] [\fB\-c\fR \fIconfig\fR] \fIinput\fR [\fIinput\fR] \fIoutput\fR
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
\&\fBharry\fR is a small tool for measuring the similarity of strings. The tool
supports common distance and kernel functions for strings as well as some
exotic similarity measures. The focus of \fBharry\fR lies on implicit
similarity measures, that is, comparison functions that do not give rise to
an explicit vector space. Examples of such similarity measures are the
Levenshtein distance, the Jaro-Winkler distance or the normalized
compression distance.
.PP
During operation \fBharry\fR loads a set of strings from \fIinput\fR, computes the
specified similarity measure and writes a matrix of similarity values to
\&\fIoutput\fR. If one \fIinput\fR is given, \fBharry\fR computes the similarities
between all strings in \fIinput\fR. If two \fIinput\fR sources are provided,
\&\fBharry\fR computes only the similarities between the two sources. The
similarity measure can be computed based on the granularity of bits, bytes
as well as tokens contained in the strings. The configuration of this
process, such as the input format, the similarity measure and the output
format, are specified in the file \fIconfig\fR and can be additionally refined
using command-line options.
.PP
.Vb 7
\&    .\-\-\-\-\-\-\-\-\-.                        .\-\-\-\-\-\-\-\-\-\-.
\&    |  lines  |                        |   text   |
\&    |   dir   |   \e   .\-\-\-\-\-\-\-\-\-.   /  |  libsvm  |
\&    |  stdin  |   \-\-  |  Harry  |  \-\-  |  matlab  |
\&    |   ...   |   /   \*(Aq\-\-\-\-\-\-\-\-\-\*(Aq   \e  |   ...    |
\&    \*(Aq\-\-\-\-\-\-\-\-\-\*(Aq                        \*(Aq\-\-\-\-\-\-\-\-\-\-\*(Aq
\&       Input      Similarity Measures     Output
.Ve
.PP
\&\fBharry\fR is implemented using OpenMP, such that the computation time for a
set of strings scales linearly with the number of available \s-1CPU\s0 cores.
Moreover, efficient implementations of several similarity measures,
effective caching of similarity values and low-overhead locking further
speed up the computation.
.PP
\&\fBharry\fR complements the tool \fBsally\fR(1) that embeds strings in a vector
space and allows computing vectorial similarity measures, such as the cosine
distance and the bag-of-words kernel.
.SH "CONFIGURATION"
.IX Header "CONFIGURATION"
The configuration of \fBharry\fR is provided by a configuration file. This
file is structured into the three sections \fBinput\fR, \fBmeasures\fR and
\&\fBoutput\fR, which define the parameters of the input format, the similarity
measures and the output format, respectively. If no configuration file is
provided, \fBharry\fR resorts to a default configuration. This default
configuration can be printed using the command-line option \fB\-D\fR (see
\&\fB\s-1OPTIONS\s0\fR).
.SS "Input formats"
.IX Subsection "Input formats"
\&\fBharry\fR supports different formats for reading sets of strings, which may
range from plain files to directories and other structured resources. The
input format is specified in the configuration file of \fBharry\fR, but can
also be defined on the command line using the option \fB\-i\fR (see \fB\s-1OPTIONS\s0\fR).
Following is a list of supported input formats:
.IP "\fBinput = {\fR" 4
.IX Item "input = {"
.RS 4
.PD 0
.ie n .IP "\fBinput_format = ""lines"";\fR" 4
.el .IP "\fBinput_format = ``lines'';\fR" 4
.IX Item "input_format = lines;"
.PD
This parameter specifies the input format.
.RS 4
.ie n .IP "\fI""lines""\fR" 14
.el .IP "\fI``lines''\fR" 14
.IX Item "lines"
The input strings are available as lines in a text file. The name of the
file is given as \fIinput\fR to \fBharry\fR. The lines need to be separated by
newline and may not contain the \s-1NUL\s0 character. Labels can be extracted from
each line using a regular expression (see \fBlines_regex\fR).
.ie n .IP "\fI""stdin""\fR" 14
.el .IP "\fI``stdin''\fR" 14
.IX Item "stdin"
The input strings are provided on standard input (stdin) as text lines. The
lines need to be separated by newline and may not contain the \s-1NUL\s0 character.
Labels can be extracted from each line using a regular expression (see
\&\fBlines_regex\fR). This input format is also enabled when \fIinput\fR is set to
\&\fI\-\fR, otherwise \fIinput\fR is ignored.
.ie n .IP "\fI""dir""\fR" 14
.el .IP "\fI``dir''\fR" 14
.IX Item "dir"
The input strings are available as binary files in a directory and the
name of the directory is given as \fIinput\fR to \fBharry\fR. The suffixes
of the files are used as labels for the strings.
.ie n .IP "\fI""arc""\fR" 14
.el .IP "\fI``arc''\fR" 14
.IX Item "arc"
The input strings are available as binary files in a compressed archive,
such as a zip or tgz archive. The name of the archive is given as \fIinput\fR
to \fBharry\fR. The suffixes of the files are used as labels for the strings.
.ie n .IP "\fI""fasta""\fR" 14
.el .IP "\fI``fasta''\fR" 14
.IX Item "fasta"
The input strings are available in \s-1FASTA\s0 format. The name of the file is
given as \fIinput\fR to \fBharry\fR. Labels can be extracted from the description
of each sequence using a regular expression (see \fBfasta_regex\fR). Comments
are allowed if they are preceded by either ';' or '>'.
.ie n .IP "\fI""raw""\fR" 14
.el .IP "\fI``raw''\fR" 14
.IX Item "raw"
The input strings are provided on standard input (stdin) in \*(L"raw\*(R" format.
This input module is designed to efficiently interface with other
environments. The binary format for strings has the form
.Sp
.Vb 3
\& | len (uint32) | array (uint8) ... |
\& | len (uint32) | array (uint8) ... |
\& | ...
.Ve
.Sp
where \fIlen\fR is a 32\-bit unsigned integer in host byte order indicating the
length of the following \fIarray\fR containing the string data in bytes.
Labels cannot be extracted from this representation. This input format is
also enabled when \fIinput\fR is set to \fI=\fR, otherwise \fIinput\fR is ignored.
.RE
.RS 4
.RE
.IP "\fBchunk_size = 256;\fR" 4
.IX Item "chunk_size = 256;"
To enable an efficient processing of large data sets, \fBharry\fR loads strings
in chunks. This parameter defines the number of strings in one of these
chunks. Depending on the lengths and type of the strings, this parameter
can be adjusted to improve loading times.
.IP "\fBdecode_str = false;\fR" 4
.IX Item "decode_str = false;"
If this parameter is set to \fItrue\fR, \fBharry\fR automatically decodes strings
that contain URI-encoded characters. That is, substrings of the form \f(CW%XX\fR
are replaced with the byte corresponding to the hexadecimal number \s-1XX.\s0
.ie n .IP "\fBfasta_regex = "" (\e\e+|\-)?[0\-9]+"";\fR" 4
.el .IP "\fBfasta_regex = `` (\e\e+|\-)?[0\-9]+'';\fR" 4
.IX Item "fasta_regex = (+|-)?[0-9]+;"
The \s-1FASTA\s0 format allows each string to be equipped with a short description. In
several data sets this description contains a numerical label which can be
used for supervised learning tasks. The parameter defines a regular
expression that matches these numerical labels, such as +1 and \-1.
.ie n .IP "\fBlines_regex = ""^(\e\e+|\-)?[0\-9]+"";\fR" 4
.el .IP "\fBlines_regex = ``^(\e\e+|\-)?[0\-9]+'';\fR" 4
.IX Item "lines_regex = ^(+|-)?[0-9]+;"
If the strings are available as text lines, the parameter can be used to
extract a numerical label from the strings. The parameter is a regular
expression matching labels, such as +1 and \-1.
.IP "\fBreverse_str = false;\fR" 4
.IX Item "reverse_str = false;"
If this parameter is set to \fItrue\fR, the characters of all input strings
will be reversed. Such reversing might help in situations where the reading
direction of the input strings is unspecified.
.ie n .IP "\fBstoptoken_file = """";\fR" 4
.el .IP "\fBstoptoken_file = ``'';\fR" 4
.IX Item "stoptoken_file = """";"
Stop tokens (irrelevant tokens) can be filtered from the strings by providing
a file containing these tokens; one per line. Non-printable characters can
be escaped using \s-1URI\s0 encoding (%XX). Stop tokens can only be filtered, if
the \fBgranularity\fR is set to \fItokens\fR.
.IP "\fBsoundex = false;\fR" 4
.IX Item "soundex = false;"
All tokens in the strings are mapped to the soundex index. For example,
\&\*(L"Pfister\*(R" is mapped to \*(L"P236\*(R" and \*(L"Jackson\*(R" to \*(L"J250\*(R". The soundex index
was originally designed for comparing names; however, in \fBharry\fR it
can be applied to all sorts of tokens, if they are composed of alphabetic
letters. Punctuation characters are ignored and thus the string \*(L"Hey, I am
here with Harry!\*(R" gets mapped to \*(L"H000 I000 A500 H600 W300 H600\*(R".
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.SS "Similarity Measures"
.IX Subsection "Similarity Measures"
\&\fBharry\fR supports different similarity measures for comparing strings,
including common distance and kernel functions. The similarity measure can
be specified in the configuration file as well as on the command line using
the option \fB\-m\fR (see \fB\s-1OPTIONS\s0\fR). The name of each similarity measure is
prefixed by its type (\fIdist_\fR for distances, \fIkern_\fR for kernels and
\&\fIsim_\fR for similarity measures). For convenience, this prefix can be
omitted. Moreover, the names of some similarity measures are aliased,
for example, the normalized compression distance is available as
\&\fIdist_compression\fR and \fIdist_ncd\fR.
.PP
Parameters of the similarity measures are organized in individual
configuration groups. For instance, parameters of the Levenshtein distance
are defined in the group \fBdist_levenshtein\fR, while parameters for the Jaro
and Jaro-Winkler distance are given in \fBdist_jarowinkler\fR.
.IP "\fBmeasures = {\fR" 4
.IX Item "measures = {"
.RS 4
.PD 0
.ie n .IP "\fBtype = ""dist_levenshtein""\fR" 4
.el .IP "\fBtype = ``dist_levenshtein''\fR" 4
.IX Item "type = dist_levenshtein"
.PD
The parameter \fBtype\fR specifies the similarity measure that is used for
comparing the strings. Supported similarity measures are:
.RS 4
.ie n .IP "\fI""dist_hamming""\fR" 4
.el .IP "\fI``dist_hamming''\fR" 4
.IX Item "dist_hamming"
Hamming distance. See configuration group \fBdist_hamming\fR.
.ie n .IP "\fI""dist_levenshtein""\fR, \fI""dist_edit""\fR" 4
.el .IP "\fI``dist_levenshtein''\fR, \fI``dist_edit''\fR" 4
.IX Item "dist_levenshtein, dist_edit"
Levenshtein distance. See configuration group \fBdist_levenshtein\fR.
.ie n .IP "\fI""dist_damerau""\fR" 4
.el .IP "\fI``dist_damerau''\fR" 4
.IX Item "dist_damerau"
Damerau-Levenshtein distance. See configuration group \fBdist_damerau\fR.
.ie n .IP "\fI""dist_osa""\fR" 4
.el .IP "\fI``dist_osa''\fR" 4
.IX Item "dist_osa"
Optimal string alignment (\s-1OSA\s0) distance. See configuration group \fBdist_osa\fR.
.ie n .IP "\fI""dist_jaro""\fR" 4
.el .IP "\fI``dist_jaro''\fR" 4
.IX Item "dist_jaro"
Jaro distance. See configuration group \fBdist_jarowinkler\fR.
.ie n .IP "\fI""dist_jarowinkler""\fR" 4
.el .IP "\fI``dist_jarowinkler''\fR" 4
.IX Item "dist_jarowinkler"
Jaro-Winkler distance. See configuration group \fBdist_jarowinkler\fR.
.ie n .IP "\fI""dist_lee""\fR" 4
.el .IP "\fI``dist_lee''\fR" 4
.IX Item "dist_lee"
Lee distance. See configuration group \fBdist_lee\fR.
.ie n .IP "\fI""dist_compression""\fR, \fI""dist_ncd""\fR" 4
.el .IP "\fI``dist_compression''\fR, \fI``dist_ncd''\fR" 4
.IX Item "dist_compression, dist_ncd"
Normalized compression distance (\s-1NCD\s0). See configuration group \fBdist_compression\fR.
.ie n .IP "\fI""dist_bag""\fR" 4
.el .IP "\fI``dist_bag''\fR" 4
.IX Item "dist_bag"
Bag distance. See configuration group \fBdist_bag\fR.
.ie n .IP "\fI""dist_kernel""\fR" 4
.el .IP "\fI``dist_kernel''\fR" 4
.IX Item "dist_kernel"
Kernel substitution distance. See configuration group \fBdist_kernel\fR.
.ie n .IP "\fI""kern_subsequence""\fR, \fI""kern_ssk""\fR" 4
.el .IP "\fI``kern_subsequence''\fR, \fI``kern_ssk''\fR" 4
.IX Item "kern_subsequence, kern_ssk"
Subsequence kernel (\s-1SSK\s0). See configuration group \fBkern_subsequence\fR.
.ie n .IP "\fI""kern_spectrum""\fR, \fI""kern_ngram""\fR" 4
.el .IP "\fI``kern_spectrum''\fR, \fI``kern_ngram''\fR" 4
.IX Item "kern_spectrum, kern_ngram"
Spectrum kernel (also n\-gram kernel). See configuration group \fBkern_spectrum\fR.
.ie n .IP "\fI""kern_wdegree""\fR, \fI""kern_wdk""\fR" 4
.el .IP "\fI``kern_wdegree''\fR, \fI``kern_wdk''\fR" 4
.IX Item "kern_wdegree, kern_wdk"
Weighted-degree kernel (\s-1WDK\s0) with shifts. See configuration group \fBkern_wdegree\fR.
.ie n .IP "\fI""kern_distance""\fR, \fI""kern_dsk""\fR" 4
.el .IP "\fI``kern_distance''\fR, \fI``kern_dsk''\fR" 4
.IX Item "kern_distance, kern_dsk"
Distance substitution kernel (\s-1DSK\s0). See configuration group \fBkern_distance\fR.
.ie n .IP "\fI""sim_simpson""\fR" 4
.el .IP "\fI``sim_simpson''\fR" 4
.IX Item "sim_simpson"
Simpson coefficient. See configuration group \fBsim_coefficient\fR.
.ie n .IP "\fI""sim_jaccard""\fR" 4
.el .IP "\fI``sim_jaccard''\fR" 4
.IX Item "sim_jaccard"
Jaccard coefficient. See configuration group \fBsim_coefficient\fR.
.ie n .IP "\fI""sim_braun""\fR" 4
.el .IP "\fI``sim_braun''\fR" 4
.IX Item "sim_braun"
Braun-Blanquet coefficient. See configuration group \fBsim_coefficient\fR.
.ie n .IP "\fI""sim_dice""\fR, \fI""sim_czekanowski""\fR" 4
.el .IP "\fI``sim_dice''\fR, \fI``sim_czekanowski''\fR" 4
.IX Item "sim_dice, sim_czekanowski"
Dice coefficient (Czekanowski coefficient). See configuration group
\&\fBsim_coefficient\fR.
.ie n .IP "\fI""sim_sokal""\fR, \fI""sim_anderberg""\fR" 4
.el .IP "\fI``sim_sokal''\fR, \fI``sim_anderberg''\fR" 4
.IX Item "sim_sokal, sim_anderberg"
Sokal-Sneath coefficient (Anderberg coefficient). See configuration group
\&\fBsim_coefficient\fR.
.ie n .IP "\fI""sim_kulczynski""\fR" 4
.el .IP "\fI``sim_kulczynski''\fR" 4
.IX Item "sim_kulczynski"
Second Kulczynski coefficient. See configuration group \fBsim_coefficient\fR.
.ie n .IP "\fI""sim_otsuka""\fR, \fI""sim_ochiai""\fR" 4
.el .IP "\fI``sim_otsuka''\fR, \fI``sim_ochiai''\fR" 4
.IX Item "sim_otsuka, sim_ochiai"
Otsuka coefficient (Ochiai coefficient). See configuration group
\&\fBsim_coefficient\fR.
.RE
.RS 4
.RE
.ie n .IP "\fBgranularity = ""bytes"";\fR" 4
.el .IP "\fBgranularity = ``bytes'';\fR" 4
.IX Item "granularity = bytes;"
This parameter controls the granularity of strings. It can be set to either
\&\fIbits\fR, \fIbytes\fR or \fItokens\fR. Depending on the granularity a string is
considered as a sequence of bits, bytes or tokens, which results in different
similarity values during comparison.
.ie n .IP "\fBtoken_delim = """";\fR" 4
.el .IP "\fBtoken_delim = ``'';\fR" 4
.IX Item "token_delim = """";"
The parameter \fBtoken_delim\fR defines characters for delimiting tokens in
strings, for example \*(L" \f(CW%0a%0d\fR\*(R". It is only considered, if the granularity
is set to \fItokens\fR, otherwise it is ignored.
.IP "\fBnum_threads = 0;\fR" 4
.IX Item "num_threads = 0;"
The parameter \fBnum_threads\fR sets the number of threads for the calculation
of the similarity measures. If set to 0, \fBharry\fR determines the number of
available \s-1CPU\s0 cores using OpenMP and sets the number of threads accordingly.
.IP "\fBcache_size = 256;\fR" 4
.IX Item "cache_size = 256;"
The parameter \fBcache_size\fR specifies the maximum size of the internal cache
in megabytes (MB). The general-purpose cache is used to speed up
computations of \fBharry\fR for some similarity measures.
.IP "\fBglobal_cache = false;\fR" 4
.IX Item "global_cache = false;"
By default \fBharry\fR caches only internal computations. If this parameter is
set to \fItrue\fR, all similarity values are stored in the cache. This feature
should only be enabled if many of the compared strings are identical and
thus caching similarity values can provide benefits.
.ie n .IP "\fBcol_range = """";\fR" 4
.el .IP "\fBcol_range = ``'';\fR" 4
.IX Item "col_range = """";"
.PD 0
.ie n .IP "\fBrow_range = """";\fR" 4
.el .IP "\fBrow_range = ``'';\fR" 4
.IX Item "row_range = """";"
.PD
These two parameters control which strings are used for computing the matrix
of similarity values. \fBcol_range\fR defines a range of indices on the
columns and \fBrow_range\fR on the rows of the matrix. The format of the
ranges is similar to indexing of Python arrays: A range is given by
"\fIstart\fR:\fIend\fR", where \fIstart\fR defines the index of the first string and
\&\fIend\fR defines the index after the last string. For example, \fI\*(L"0:4\*(R"\fR
selects the strings at index 0, 1, 2, and 3. If the start or end index is
omitted, the minimum or maximum value is substituted, respectively. For
example, \fI\*(L":4\*(R"\fR selects strings starting from the index \fI0\fR and \fI\*(L":\*(R"\fR
chooses all strings. If the end index is negative, it is subtracted from
the maximum index, that is, \fI\*(L":\-1\*(R"\fR selects all strings except for the last
one.
.Sp
The parameters \fBcol_range\fR and \fBrow_range\fR are ignored if two input sources
are given on the command line.
.ie n .IP "\fBsplit = """";\fR" 4
.el .IP "\fBsplit = ``'';\fR" 4
.IX Item "split = """";"
To ease the computation of large similarity matrices, \fBharry\fR supports
automatically splitting a matrix into blocks. This splitting is defined by
a string of the form "\fIblocks\fR:\fIidx\fR", where \fIblocks\fR defines the number
of blocks and \fIidx\fR the index of the block to compute. The matrix is
split along the y\-axis. For many output formats the blocks can be
simply concatenated to get the original matrix.
.Sp
The parameter \fBsplit\fR is ignored if two input sources are given on the
command line.
.IP "\fBdist_hamming = {\fR" 4
.IX Item "dist_hamming = {"
This module implements the Hamming distance (see Hamming, 1950). The
runtime complexity of a comparison is linear in the length of the strings.
If the compared strings have unequal length, the length difference is added
to the distance. The following parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the distance. Supported values
are \fI\*(L"none\*(R"\fR for no normalization, \fI\*(L"min\*(R"\fR for normalization on the
minimum length, \fI\*(L"max\*(R"\fR for normalization on the maximum length, \fI\*(L"avg\*(R"\fR
for normalization on the average length of the compared strings.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_levenshtein = {\fR" 4
.IX Item "dist_levenshtein = {"
.PD
This module implements the Levenshtein distance (see Levenshtein, 1966). The
runtime complexity of a comparison is quadratic in the length of the
strings. The following parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the distance. Supported values
are \fI\*(L"none\*(R"\fR for no normalization, \fI\*(L"min\*(R"\fR for normalization on the
minimum length, \fI\*(L"max\*(R"\fR for normalization on the maximum length, \fI\*(L"avg\*(R"\fR
for normalization on the average length of the compared strings.
.IP "\fBcost_ins = 1.0;\fR" 4
.IX Item "cost_ins = 1.0;"
.PD 0
.IP "\fBcost_del = 1.0;\fR" 4
.IX Item "cost_del = 1.0;"
.IP "\fBcost_sub = 1.0;\fR" 4
.IX Item "cost_sub = 1.0;"
.PD
The computation of the distance can be adapted using three parameters
defining the cost for an insertion, deletion and substitution,
respectively. The default costs are \fI1.0\fR for each operation.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_damerau = {\fR" 4
.IX Item "dist_damerau = {"
.PD
This module implements the Damerau-Levenshtein distance (see Damerau, 1964).
The runtime and space complexity of a comparison is quadratic in the length
of the strings. The following parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the distance. Supported values
are \fI\*(L"none\*(R"\fR for no normalization, \fI\*(L"min\*(R"\fR for normalization on the
minimum length, \fI\*(L"max\*(R"\fR for normalization on the maximum length, \fI\*(L"avg\*(R"\fR
for normalization on the average length of the compared strings.
.IP "\fBcost_ins = 1.0;\fR" 4
.IX Item "cost_ins = 1.0;"
.PD 0
.IP "\fBcost_del = 1.0;\fR" 4
.IX Item "cost_del = 1.0;"
.IP "\fBcost_sub = 1.0;\fR" 4
.IX Item "cost_sub = 1.0;"
.IP "\fBcost_tra = 1.0;\fR" 4
.IX Item "cost_tra = 1.0;"
.PD
The computation of the distance can be adapted using four parameters
defining the cost for an insertion, deletion, substitution and
transposition, respectively. The default costs are \fI1.0\fR for each
operation.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_osa = {\fR" 4
.IX Item "dist_osa = {"
.PD
This module implements the optimal string alignment (\s-1OSA\s0) distance, which is
often confused with the Damerau-Levenshtein distance. The difference
between the two is that the \s-1OSA\s0 distance computes the number of edit
operations needed to make the strings equal under the condition that no
substring is edited more than once (see the Wikipedia article on the
Damerau-Levenshtein distance). The runtime and space complexity of a
comparison is quadratic in the length of the strings. The following
parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the distance. Supported values
are \fI\*(L"none\*(R"\fR for no normalization, \fI\*(L"min\*(R"\fR for normalization on the
minimum length, \fI\*(L"max\*(R"\fR for normalization on the maximum length, \fI\*(L"avg\*(R"\fR
for normalization on the average length of the compared strings.
.IP "\fBcost_ins = 1.0;\fR" 4
.IX Item "cost_ins = 1.0;"
.PD 0
.IP "\fBcost_del = 1.0;\fR" 4
.IX Item "cost_del = 1.0;"
.IP "\fBcost_sub = 1.0;\fR" 4
.IX Item "cost_sub = 1.0;"
.IP "\fBcost_tra = 1.0;\fR" 4
.IX Item "cost_tra = 1.0;"
.PD
The computation of the distance can be adapted using four parameters
defining the cost for an insertion, deletion, substitution and
transposition, respectively. The default costs are \fI1.0\fR for each
operation.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_jarowinkler = {\fR" 4
.IX Item "dist_jarowinkler = {"
.PD
This module implements the Jaro distance (Jaro, 1989) and the Jaro-Winkler
distance (Winkler, 1990). In contrast to the original formulation, a valid
distance function is implemented, where similar strings yield a low value
and dissimilar strings a high value. The runtime complexity of a comparison
is quadratic in the length of the strings. The following parameters are
supported:
.RS 4
.IP "\fBscaling = 0.1;\fR" 4
.IX Item "scaling = 0.1;"
If this parameter is set to \fI0\fR, the original Jaro distance is returned,
otherwise the Jaro-Winkler distance is calculated. This distance uses a
\&\fBscaling\fR which gives more favorable ratings to strings that match from
the beginning up to 4 symbols. The default value is \fI0.1\fR.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_lee = {\fR" 4
.IX Item "dist_lee = {"
.PD
This module implements the Lee distance (Lee, 1958) for strings. The runtime
complexity of a comparison is linear in the length of the strings. If the
compared strings have unequal length, the remaining symbols of the longer
string are added to the distance. The following parameters are supported:
.RS 4
.IP "\fBmin_sym = 0;\fR" 4
.IX Item "min_sym = 0;"
.PD 0
.IP "\fBmax_sym = 255;\fR" 4
.IX Item "max_sym = 255;"
.PD
These parameters specify the range of symbols, that is, the minimum and
maximum value of a symbol in all strings. If the strings consist of bytes,
\&\fBmin_sym\fR is typically set to \fI0\fR and \fBmax_sym\fR to \fI255\fR. For printable
characters the range can be further narrowed to \fI32\fR and \fI126\fR. If tokens
are analyzed using the parameter \fBtoken_delim\fR, \fBmin_sym\fR must be set to 0
and \fBmax_sym\fR to \fI65535\fR, as the tokens are mapped to integers in this
range.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_compression = {\fR" 4
.IX Item "dist_compression = {"
.PD
This module implements the normalized compression distance for strings
(Cilibrasi and Vitanyi, 2005). The distance is \*(L"symmetrized\*(R". The
compression is implemented using \fBzlib\fR. Note that the comparison of
strings highly depends on the characteristics of the compressor (Cebrian et
al., 2005). The strings should not be longer than 16 kilobytes, such that
two strings fit into the window of \fBzlib\fR. The runtime complexity of a
comparison is linear in the length of the strings, though with a large
constant factor. The following parameters are supported:
.RS 4
.IP "\fBlevel = 9;\fR" 4
.IX Item "level = 9;"
This parameter defines the compression level used by \fBzlib\fR and must be
between \fI1\fR and \fI9\fR, where \fI1\fR gives the best speed and \fI9\fR the best
compression. See \fB\f(BIzlib\fB\|(3)\fR for details.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_bag = {\fR" 4
.IX Item "dist_bag = {"
.PD
This module implements the bag distance (see Bartolini et al., 2002). The
distance approximates and lower bounds the Levenshtein distance. The
runtime complexity of a comparison is linear in the length of the strings.
The following parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the distance. Supported values
are \fI\*(L"none\*(R"\fR for no normalization, \fI\*(L"min\*(R"\fR for normalization on the
minimum length, \fI\*(L"max\*(R"\fR for normalization on the maximum length, \fI\*(L"avg\*(R"\fR
for normalization on the average length of the compared strings.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBdist_kernel = {\fR" 4
.IX Item "dist_kernel = {"
.PD
This module implements a kernel-based distance, that is, a distance is
computed given a kernel function for strings. The specified kernel function
is mapped to a Euclidean distance using simple geometry. The runtime
complexity depends on the kernel function. The following parameters are
supported:
.RS 4
.ie n .IP "\fBkern = ""kern_wdegree"";\fR" 4
.el .IP "\fBkern = ``kern_wdegree'';\fR" 4
.IX Item "kern = kern_wdegree;"
This parameter selects the kernel function to use for the distance. The
kernel is mapped to a Euclidean distance using simple geometry.
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the kernel. Supported values
are \fI\*(L"none\*(R"\fR for no normalization and \fI\*(L"l2\*(R"\fR for the standard l2
normalization of kernels.
.IP "\fBsquared = true;\fR" 4
.IX Item "squared = true;"
The module computes a Euclidean distance from the given kernel function. If
this parameter is enabled a squared Euclidean distance is returned which
is slightly faster due to the omitted root computation.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBkern_wdegree = {\fR" 4
.IX Item "kern_wdegree = {"
.PD
This module implements the weighted-degree kernel with shifts (Sonnenburg et
al., 2007). The runtime complexity is linear in the length of the strings.
If the strings have unequal length, the remaining symbols of the longer
string are ignored, in accordance with the kernel definition. The following
parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the kernel. Supported values
are \fI\*(L"none\*(R"\fR for no normalization and \fI\*(L"l2\*(R"\fR for the standard l2
normalization of kernels.
.IP "\fBdegree = 3;\fR" 4
.IX Item "degree = 3;"
This parameter specifies the degree of the kernel, that is, the length of
considered k\-mers/k\-grams. As the kernel computation is implicit, the
k\-mers are not extracted but implicitly counted by blocks of matching symbols.
.IP "\fBshift = 0;\fR" 4
.IX Item "shift = 0;"
To compensate for noise in the strings, the kernel can be computed with \*(L"shifts\*(R".
The strings are compared multiple times with different positive and negative
offsets up to \fBshift\fR symbols. The different kernel values are added. The
runtime complexity is increased by twice the value of \fBshift\fR.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBkern_subsequence = {\fR" 4
.IX Item "kern_subsequence = {"
.PD
This module implements the subsequence kernel (Lodhi et al., 2002). The
runtime complexity is quadratic in the length of the strings. The following
parameters are supported:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the kernel. Supported values
are \fI\*(L"none\*(R"\fR for no normalization and \fI\*(L"l2\*(R"\fR for the standard l2
normalization of kernels.
.IP "\fBlength = 3;\fR" 4
.IX Item "length = 3;"
This parameter specifies the length of subsequences to consider.
.IP "\fBlambda = 0.1;\fR" 4
.IX Item "lambda = 0.1;"
This parameter is a weighting term for gaps within subsequences.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBkern_spectrum = {\fR" 4
.IX Item "kern_spectrum = {"
.PD
This module implements the spectrum kernel (Leslie et al., 2002). The
runtime complexity is linear in the length of the strings. The spectrum
kernel is closely related to bag-of-words kernels. Thus, the tool
\&\fB\f(BIsally\fB\|(1)\fR may be alternatively used to compute the kernel using an
explicit vector space. The following parameters are supported by the
implementation:
.RS 4
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the kernel. Supported values
are \fI\*(L"none\*(R"\fR for no normalization and \fI\*(L"l2\*(R"\fR for the standard l2
normalization of kernels.
.IP "\fBlength = 3;\fR" 4
.IX Item "length = 3;"
This parameter specifies the length of k\-mers/k\-grams to consider.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBkern_distance = {\fR" 4
.IX Item "kern_distance = {"
.PD
This module implements distance substitution kernels (Haasdonk and Bahlmann,
2004). The empty string is considered the origin of the underlying implicit
vector space. The runtime complexity depends on the selected distance
function. The following parameters are supported:
.RS 4
.ie n .IP "\fBdist = ""dist_bag"";\fR" 4
.el .IP "\fBdist = ``dist_bag'';\fR" 4
.IX Item "dist = dist_bag;"
This parameter selects the distance function to use for the kernel.
Depending on the type of the substitution and the selected distance, the
kernel might not be positive semi-definite.
.ie n .IP "\fBtype = ""linear"";\fR" 4
.el .IP "\fBtype = ``linear'';\fR" 4
.IX Item "type = linear;"
Four types of substitutions can be selected for creating a kernel from a
distance function: \fI\*(L"linear\*(R"\fR, \fI\*(L"poly\*(R"\fR, \fI\*(L"neg\*(R"\fR and \fI\*(L"rbf\*(R"\fR. For a
detailed explanation of each substitution see the paper by Haasdonk and
Bahlmann (2004).
.ie n .IP "\fBnorm = ""none"";\fR" 4
.el .IP "\fBnorm = ``none'';\fR" 4
.IX Item "norm = none;"
This parameter specifies the normalization of the kernel. Supported values
are \fI\*(L"none\*(R"\fR for no normalization and \fI\*(L"l2\*(R"\fR for the standard l2
normalization of kernels.
.IP "\fBgamma = 1.0;\fR" 4
.IX Item "gamma = 1.0;"
This parameter specifies a scaling factor for the substitution types
\&\fI\*(L"poly\*(R"\fR and \fI\*(L"rbf\*(R"\fR.
.IP "\fBdegree = 1.0;\fR" 4
.IX Item "degree = 1.0;"
This parameter defines a polynomial degree for the substitution types
\&\fI\*(L"poly\*(R"\fR and \fI\*(L"neg\*(R"\fR.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.PD 0
.IP "\fBsim_coefficient = {\fR" 4
.IX Item "sim_coefficient = {"
.PD
This module implements several similarity coefficients for strings (see
Cheetham and Hazel, 1969). The runtime complexity of a comparison is linear
in the length of the strings. The following parameters are supported:
.RS 4
.ie n .IP "\fBmatching = ""bin"";\fR" 4
.el .IP "\fBmatching = ``bin'';\fR" 4
.IX Item "matching = bin;"
The parameter specifies how the symbols of the strings are matched. If the
parameter is set to \fI\*(L"bin\*(R"\fR, the symbols are considered as binary attributes
that are either present or not. If the parameter is set to \fI\*(L"cnt\*(R"\fR, the
count of each symbol is considered for the matching.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.RE
.RS 4
.RE
.PD 0
.IP "\fB};\fR" 4
.IX Item "};"
.PD
.SS "Output formats"
.IX Subsection "Output formats"
Once strings have been compared, \fBharry\fR stores the similarity values in
one of several common formats, which allows for applying typical tools of
statistics and machine learning to the data. Following is a list of
supported output formats and respective parameters. Additionally, the
output format can be specified using the command-line option \fB\-o\fR (see
\&\fB\s-1OPTIONS\s0\fR).
.IP "\fBoutput = {\fR" 4
.IX Item "output = {"
.RS 4
.PD 0
.ie n .IP "\fBoutput_format = ""text"";\fR" 4
.el .IP "\fBoutput_format = ``text'';\fR" 4
.IX Item "output_format = text;"
.PD
Following is a list of output formats supported by \fBharry\fR:
.RS 4
.ie n .IP "\fI""text""\fR" 14
.el .IP "\fI``text''\fR" 14
.IX Item "text"
The similarity values are stored as plain text.
.ie n .IP "\fI""stdout""\fR" 14
.el .IP "\fI``stdout''\fR" 14
.IX Item "stdout"
The similarity values are written to standard output (stdout) as plain text.
This output format is also enabled when \fIoutput\fR is set to \fI\-\fR, otherwise
\&\fIoutput\fR is ignored.
.ie n .IP "\fI""libsvm""\fR" 14
.el .IP "\fI``libsvm''\fR" 14
.IX Item "libsvm"
The similarity values are stored as precomputed kernel for libsvm.
.ie n .IP "\fI""json""\fR" 14
.el .IP "\fI``json''\fR" 14
.IX Item "json"
The similarity values are stored as a \s-1JSON\s0 object.
.ie n .IP "\fI""matlab""\fR" 14
.el .IP "\fI``matlab''\fR" 14
.IX Item "matlab"
The similarity values are stored in Matlab format (version 5).
.ie n .IP "\fI""raw""\fR" 14
.el .IP "\fI``raw''\fR" 14
.IX Item "raw"
The similarity values are written to standard output (stdout) in raw format.
This output module is designed for interfacing with other analysis
environments. The format of the similarity matrix has the following form:
.Sp
.Vb 2
\& | rows (uint32) | cols (uint32) |
\& | fsize (uint32) | array (float) ... |
.Ve
.Sp
where \fIrows\fR and \fIcols\fR are unsigned 32\-bit integers specifying the
dimensions of the matrix, \fIfsize\fR is the size of a float in bytes and
\&\fIarray\fR holds the matrix as floats. Indices, labels and sources are not
output. This output format is also enabled when \fIoutput\fR is set to \fI=\fR,
otherwise \fIoutput\fR is ignored.
.RE
.RS 4
.RE
.IP "\fBprecision = 0;\fR" 4
.IX Item "precision = 0;"
Precision of the output in terms of decimal places. A precision of \fI0\fR
selects the full single float range for output.
.ie n .IP "\fBseparator = "","";\fR" 4
.el .IP "\fBseparator = ``,'';\fR" 4
.IX Item "separator = ,;"
This parameter defines the separator used in text mode for separating
the similarity values.
.IP "\fBsave_indices = false;\fR" 4
.IX Item "save_indices = false;"
If this parameter is set to \fItrue\fR and supported by the output format, the
indices of the strings will be additionally stored.
.IP "\fBsave_labels = false;\fR" 4
.IX Item "save_labels = false;"
If this parameter is set to \fItrue\fR and supported by the output format, the
labels of the strings will be additionally stored.
.IP "\fBsave_sources = false;\fR" 4
.IX Item "save_sources = false;"
If this parameter is set to \fItrue\fR and supported by the output format, the
sources of the strings will be additionally stored.
.IP "\fBcompress = false;\fR" 4
.IX Item "compress = false;"
If this parameter is set to \fItrue\fR, the output is stored using zlib
compression, which can significantly reduce the required disk space.
Several programs support reading files compressed using zlib.
Alternatively, the tools \fIgzcat\fR\|(1) and \fIgunzip\fR\|(1) can be used to access the
data.
.RE
.RS 4
.RE
.IP "\fB};\fR" 4
.IX Item "};"
.SH "OPTIONS"
.IX Header "OPTIONS"
The configuration of \fBharry\fR can be refined using several command-line
options. Moreover, some parameters of the configuration can be overwritten
on the command line. Following is the list of options:
.SS "I/O options"
.IX Subsection "I/O options"
.Vb 12
\& \-i, \-\-input_format <format> Set input format for strings.
\& \-\-decode_str Enable URI\-decoding of strings.
\& \-\-reverse_str Reverse (flip) all strings.
\& \-\-stoptoken_file <file> Provide a file with stop tokens.
\& \-\-soundex Enable soundex encoding of tokens.
\& \-\-benchmark <seconds> Perform benchmark run.
\& \-o, \-\-output_format <format> Set output format for matrix.
\& \-p, \-\-precision <num> Set precision of output.
\& \-z, \-\-compress Enable zlib compression of output.
\& \-\-save_indices Save indices of strings.
\& \-\-save_labels Save labels of strings.