src/viewer/ascii.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051

/*
   Internal file viewer for the Midnight Commander
   Function for plain view

   Copyright (C) 1994-2024
   Free Software Foundation, Inc.

   Written by:
   Miguel de Icaza, 1994, 1995, 1998
   Janne Kukonlehto, 1994, 1995
   Jakub Jelinek, 1995
   Joseph M. Hinkle, 1996
   Norbert Warmuth, 1997
   Pavel Machek, 1998
   Roland Illig <roland.illig@gmx.de>, 2004, 2005
   Slava Zanko <slavazanko@google.com>, 2009
   Andrew Borodin <aborodin@vmail.ru>, 2009-2022
   Ilia Maslakov <il.smind@gmail.com>, 2009
   Rewritten almost from scratch by:
   Egmont Koblinger <egmont@gmail.com>, 2014

   This file is part of the Midnight Commander.

   The Midnight Commander is free software: you can redistribute it
   and/or modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   The Midnight Commander is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   ------------------------------------------------------------------------------------------------

   The viewer is implemented along the following design principles:

   Goals: Always display simple scripts, double wide (CJK), combining accents and spacing marks
   (often used e.g. in Devanagari) perfectly. Make the arrow keys always work correctly.

   Absolutely non-goal: RTL.

   Terminology:

   - A "paragraph" is the text between two adjacent newline characters. A "line" or "row" is a
   visual row on the screen. In wrap mode, the viewer formats a paragraph into one or more lines.

   - The Unicode glossary <http://www.unicode.org/glossary/> doesn't seem to have a notion of "base
   character followed by zero or more combining characters". The closest matches are "Combining
   Character Sequence" meaning a base character followed by one or more combining characters, or
   "Grapheme" which seems to exclude non-printable characters such as newline. In this file,
   "combining character sequence" (or any obvious abbreviation thereof) means a base character
   followed by zero or more (up to a current limit of 4) combining characters.

   ------------------------------------------------------------------------------------------------

   The parser-formatter is designed to be stateless across paragraphs. This is so that we can walk
   backwards without having to reparse the whole file (although we still need to reparse and
   reformat the whole paragraph, but it's a lot better). This principle needs to be changed if we
   ever get to address tickets 1849/2977, but then we can still store (for efficiency) the parser
   state at the beginning of the paragraph, and safely walk backwards if we don't cross an escape
   character.

   The parser-formatter, however, definitely needs to carry a state across lines. Currently this
   state contains:

   - The logical column (as if we didn't wrap). This is used for handling TAB characters after a
   wordwrap consistently with less.

   - Whether the last nroff character was bold or underlined. This is used for displaying the
   ambiguous _\b_ sequence consistently with less.

   - Whether the desired way of displaying a lonely combining accent or spacing mark is to place it
   over a dotted circle (we do this at the beginning of the paragraph of after a TAB), or to ignore
   the combining char and show replacement char for the spacing mark (we do this if e.g. too many
   of these were encountered and hence we don't glue them with their base character).

   - (This state needs to be expanded if e.g. we decide to print verbose replacement characters
   (e.g. "<U+0080>") and allow these to wrap around lines.)

   The state also contains the file offset, as it doesn't make sense to ever know the state without
   knowing the corresponding offset.

   The state depends on various settings (viewer width, encoding, nroff mode, charwrap or wordwrap
   mode (if we'll have that one day) etc.), needs to be recomputed if any of these changes.

   Walking forwards is usually relatively easy both in the file and on the screen. Walking
   backwards within a paragraph would only be possible in some special cases and even then it would
   be painful, so we always walk back to the beginning of the paragraph and reparse-reformat from
   there.

   (Walking back within a line in the file would have at least the following difficulties: handling
   the parser state; processing invalid UTF-8; processing invalid nroff (e.g. what is "_\bA\bA"?).
   Walking back on the display: we wouldn't know where to display the last line of a paragraph, or
   where to display a line if its following line starts with a wide (CJK or Tab) character. Long
   story short: just forget this approach.)

   Most important variables:

   - dpy_start: Both in unwrap and wrap modes this points to the beginning of the topmost displayed
   paragraph.

   - dpy_text_column: Only in unwrap mode, an additional horizontal scroll.

   - dpy_paragraph_skip_lines: Only in wrap mode, an additional vertical scroll (the number of
   lines that are scrolled off at the top from the topmost paragraph).

   - dpy_state_top: Only in wrap mode, the offset and parser-formatter state at the line where
   displaying the file begins is cached here.

   - dpy_wrap_dirty: If some parameter has changed that makes it necessary to reparse-redisplay the
   topmost paragraph.

   In wrap mode, the three variables "dpy_start", "dpy_paragraph_skip_lines" and "dpy_state_top"
   are kept consistent. Think of the first two as the ones describing the position, and the third
   as a cached value for better performance so that we don't need to wrap the invisible beginning
   of the topmost paragraph over and over again. The third value needs to be recomputed each time a
   parameter that influences parsing or displaying the file (e.g. width of screen, encoding, nroff
   mode) changes, this is signaled by "dpy_wrap_dirty" to force recomputing "dpy_state_top" (and
   clamp "dpy_paragraph_skip_lines" if necessary).

   ------------------------------------------------------------------------------------------------

   Help integration

   I'm planning to port the help viewer to this codebase.

   Splitting at sections would still happen in the help viewer. It would either copy a section, or
   set force_max and a similar force_min to limit displaying to one section only.

   Parsing the help format would go next to the nroff parser. The colors, alternate character set,
   and emitting the version number would go to the "state". (The version number would be
   implemented by emitting remaining characters of a buffer in the "state" one by one, without
   advancing in the file position.)

   The active link would be drawn similarly to the search highlight. Other than that, the viewer
   wouldn't care about links (except for their color). help.c would keep track of which one is
   highlighted, how to advance to the next/prev on an arrow, how the scroll offset needs to be
   adjusted when moving, etc.

   Add wrapping at word boundaries to where wrapping at char boundaries happens now.
 */

#include <config.h>

#include "lib/global.h"
#include "lib/tty/tty.h"
#include "lib/skin.h"
#include "lib/util.h"           /* is_printable() */
#ifdef HAVE_CHARSET
#include "lib/charsets.h"
#endif

#include "src/setup.h"          /* option_tab_spacing */

#include "internal.h"

/*** global variables ****************************************************************************/

/*** file scope macro definitions ****************************************************************/

/* The Unicode standard recommends that lonely combining characters are printed over a dotted
 * circle. If the terminal is not UTF-8, this will be replaced by a dot anyway. */
#define BASE_CHARACTER_FOR_LONELY_COMBINING 0x25CC      /* dotted circle */
#define MAX_COMBINING_CHARS 4   /* both slang and ncurses support exactly 4 */

/* I think anything other than space (e.g. arrows) just introduce visual clutter without actually
 * adding value. */
#define PARTIAL_CJK_AT_LEFT_MARGIN  ' '
#define PARTIAL_CJK_AT_RIGHT_MARGIN ' '

/*
 * Wrap mode: This is for safety so that jumping to the end of file (which already includes
 * scrolling back by a page) and then walking backwards is reasonably fast, even if the file is
 * extremely large and consists of maybe full zeros or something like that. If there's no newline
 * found within this limit, just start displaying from there and see what happens. We might get
 * some displaying parameters (most importantly the columns) incorrect, but at least will show the
 * file without spinning the CPU for ages. When scrolling back to that point, the user might see a
 * garbled first line (even starting with an invalid partial UTF-8), but then walking back by yet
 * another line should fix it.
 *
 * Unwrap mode: This is not used, we wouldn't be able to do anything reasonable without walking
 * back a whole paragraph (well, view->data_area.height paragraphs actually).
 */
#define MAX_BACKWARDS_WALK_IN_PARAGRAPH (100 * 1000)

/*** file scope type declarations ****************************************************************/

/*** forward declarations (file scope functions) *************************************************/

/*** file scope variables ************************************************************************/

/* --------------------------------------------------------------------------------------------- */
/*** file scope functions ************************************************************************/
/* --------------------------------------------------------------------------------------------- */

/* TODO: These methods shouldn't be necessary, see ticket 3257 */

static int
mcview_wcwidth (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (view->utf8)
    {
        if (g_unichar_iswide (c))
            return 2;
        if (g_unichar_iszerowidth (c))
            return 0;
    }
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */
    return 1;
}

/* --------------------------------------------------------------------------------------------- */

static gboolean
mcview_ismark (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (view->utf8)
        return g_unichar_ismark (c);
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */
    return FALSE;
}

/* --------------------------------------------------------------------------------------------- */

/* actually is_non_spacing_mark_or_enclosing_mark */
static gboolean
mcview_is_non_spacing_mark (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (view->utf8)
    {
        GUnicodeType type;

        type = g_unichar_type (c);

        return type == G_UNICODE_NON_SPACING_MARK || type == G_UNICODE_ENCLOSING_MARK;
    }
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */
    return FALSE;
}

/* --------------------------------------------------------------------------------------------- */

#if 0
static gboolean
mcview_is_spacing_mark (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (view->utf8)
        return g_unichar_type (c) == G_UNICODE_SPACING_MARK;
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */
    return FALSE;
}
#endif /* 0 */

/* --------------------------------------------------------------------------------------------- */

static gboolean
mcview_isprint (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (!view->utf8)
        c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
    return g_unichar_isprint (c);
#else
    (void) view;
    /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
    return is_printable (c);
#endif /* HAVE_CHARSET */
}

/* --------------------------------------------------------------------------------------------- */

static int
mcview_char_display (const WView * view, int c, char *s)
{
#ifdef HAVE_CHARSET
    if (mc_global.utf8_display)
    {
        if (!view->utf8)
            c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
        if (!g_unichar_isprint (c))
            c = '.';
        return g_unichar_to_utf8 (c, s);
    }
    if (view->utf8)
    {
        if (g_unichar_iswide (c))
        {
            s[0] = s[1] = '.';
            return 2;
        }
        if (g_unichar_iszerowidth (c))
            return 0;
        /* TODO the is_printable check below will be broken for this */
        c = convert_from_utf_to_current_c (c, view->converter);
    }
    else
    {
        /* TODO the is_printable check below will be broken for this */
        c = convert_to_display_c (c);
    }
#else
    (void) view;
#endif /* HAVE_CHARSET */
    /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
    if (!is_printable (c))
        c = '.';
    *s = c;
    return 1;
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Just for convenience, a common interface in front of mcview_get_utf and mcview_get_byte, so that
 * the caller doesn't have to care about utf8 vs 8-bit modes.
 *
 * Normally: stores c, updates state, returns TRUE.
 * At EOF: state is unchanged, c is undefined, returns FALSE.
 *
 * Just as with mcview_get_utf(), invalid UTF-8 is reported using negative integers.
 *
 * Also, temporary hack: handle force_max here.
 * TODO: move it to lower layers (datasource.c)?
 */
static gboolean
mcview_get_next_char (WView * view, mcview_state_machine_t * state, int *c)
{
    /* Pretend EOF if we reached force_max */
    if (view->force_max >= 0 && state->offset >= view->force_max)
        return FALSE;

#ifdef HAVE_CHARSET
    if (view->utf8)
    {
        int char_length = 0;

        if (!mcview_get_utf (view, state->offset, c, &char_length))
            return FALSE;
        /* Pretend EOF if we crossed force_max */
        if (view->force_max >= 0 && state->offset + char_length > view->force_max)
            return FALSE;

        state->offset += char_length;
        return TRUE;
    }
#endif /* HAVE_CHARSET */
    if (!mcview_get_byte (view, state->offset, c))
        return FALSE;
    state->offset++;
    return TRUE;
}

/* --------------------------------------------------------------------------------------------- */
/**
 * This function parses the next nroff character and gives it to you along with its desired color,
 * so you never have to care about nroff again.
 *
 * The nroff mode does the backspace trick for every single character (Unicode codepoint). At least
 * that's what the GNU groff 1.22 package produces, and that's what less 458 expects. For
 * double-wide characters (CJK), still only a single backspace is emitted. For combining accents
 * and such, the print-backspace-print step is repeated for the base character and then for each
 * accent separately.
 *
 * So, the right place for this layer is after the bytes are interpreted in UTF-8, but before
 * joining a base character with its combining accents.
 *
 * Normally: stores c and color, updates state, returns TRUE.
 * At EOF: state is unchanged, c and color are undefined, returns FALSE.
 *
 * color can be null if the caller doesn't care.
 */
static gboolean
mcview_get_next_maybe_nroff_char (WView * view, mcview_state_machine_t * state, int *c, int *color)
{
    mcview_state_machine_t state_after_nroff;
    int c2, c3;

    if (color != NULL)
        *color = VIEW_NORMAL_COLOR;

    if (!view->mode_flags.nroff)
        return mcview_get_next_char (view, state, c);

    if (!mcview_get_next_char (view, state, c))
        return FALSE;
    /* Don't allow nroff formatting around CR, LF, TAB or other special chars */
    if (!mcview_isprint (view, *c))
        return TRUE;

    state_after_nroff = *state;

    if (!mcview_get_next_char (view, &state_after_nroff, &c2))
        return TRUE;
    if (c2 != '\b')
        return TRUE;

    if (!mcview_get_next_char (view, &state_after_nroff, &c3))
        return TRUE;
    if (!mcview_isprint (view, c3))
        return TRUE;

    if (*c == '_' && c3 == '_')
    {
        *state = state_after_nroff;
        if (color != NULL)
            *color =
                state->nroff_underscore_is_underlined ? VIEW_UNDERLINED_COLOR : VIEW_BOLD_COLOR;
    }
    else if (*c == c3)
    {
        *state = state_after_nroff;
        state->nroff_underscore_is_underlined = FALSE;
        if (color != NULL)
            *color = VIEW_BOLD_COLOR;
    }
    else if (*c == '_')
    {
        *c = c3;
        *state = state_after_nroff;
        state->nroff_underscore_is_underlined = TRUE;
        if (color != NULL)
            *color = VIEW_UNDERLINED_COLOR;
    }

    return TRUE;
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Get one base character, along with its combining or spacing mark characters.
 *
 * (A spacing mark is a character that extends the base character's width 1 into a combined
 * character of width 2, yet these two character cells should not be separated. E.g. Devanagari
 * <U+0939><U+094B>.)
 *
 * This method exists mainly for two reasons. One is to be able to tell if we fit on the current
 * line or need to wrap to the next one. The other is that both slang and ncurses seem to require
 * that the character and its combining marks are printed in a single call (or is it just a
 * limitation of mc's wrapper to them?).
 *
 * For convenience, this method takes care of converting CR or CR+LF into LF.
 * TODO this should probably happen later, when displaying the file?
 *
 * Normally: stores cs and color, updates state, returns >= 1 (entries in cs).
 * At EOF: state is unchanged, cs and color are undefined, returns 0.
 *
 * @param view ...
 * @param state the parser-formatter state machine's state, updated
 * @param cs store the characters here
 * @param clen the room available in cs (that is, at most clen-1 combining marks are allowed), must
 *   be at least 2
 * @param color if non-NULL, store the color here, taken from the first codepoint's color
 * @return the number of entries placed in cs, or 0 on EOF
 */
static int
mcview_next_combining_char_sequence (WView * view, mcview_state_machine_t * state, int *cs,
                                     int clen, int *color)
{
    int i = 1;

    if (!mcview_get_next_maybe_nroff_char (view, state, cs, color))
        return 0;

    /* Process \r and \r\n newlines. */
    if (cs[0] == '\r')
    {
        int cnext;

        mcview_state_machine_t state_after_crlf = *state;
        if (mcview_get_next_maybe_nroff_char (view, &state_after_crlf, &cnext, NULL)
            && cnext == '\n')
            *state = state_after_crlf;
        cs[0] = '\n';
        return 1;
    }

    /* We don't want combining over non-printable characters. This includes '\n' and '\t' too. */
    if (!mcview_isprint (view, cs[0]))
        return 1;

    if (mcview_ismark (view, cs[0]))
    {
        if (!state->print_lonely_combining)
        {
            /* First character is combining. Either just return it, ... */
            return 1;
        }
        else
        {
            /* or place this (and subsequent combining ones) over a dotted circle. */
            cs[1] = cs[0];
            cs[0] = BASE_CHARACTER_FOR_LONELY_COMBINING;
            i = 2;
        }
    }

    if (mcview_wcwidth (view, cs[0]) == 2)
    {
        /* Don't allow combining or spacing mark for wide characters, is this okay? */
        return 1;
    }

    /* Look for more combining chars. Either at most clen-1 zero-width combining chars,
     * or at most 1 spacing mark. Is this logic correct? */
    for (; i < clen; i++)
    {
        mcview_state_machine_t state_after_combining;

        state_after_combining = *state;
        if (!mcview_get_next_maybe_nroff_char (view, &state_after_combining, &cs[i], NULL))
            return i;
        if (!mcview_ismark (view, cs[i]) || !mcview_isprint (view, cs[i]))
            return i;
        if (g_unichar_type (cs[i]) == G_UNICODE_SPACING_MARK)
        {
            /* Only allow as the first combining char. Stop processing in either case. */
            if (i == 1)
            {
                *state = state_after_combining;
                i++;
            }
            return i;
        }
        *state = state_after_combining;
    }
    return i;
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Parse, format and possibly display one visual line of text.
 *
 * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 * default state, the additional horizontal scrolling is added here. In wrap mode, this should
 * point to the beginning of the line, with the proper state at that point.
 *
 * In wrap mode, if a line ends in a newline, it is consumed, even if it's exactly at the right
 * edge. In unwrap mode, the whole remaining line, including the newline is consumed. Displaying
 * the next line should start at "state"'s new value, or if we displayed the bottom line then
 * state->offset tells the file offset to be shown in the top bar.
 *
 * If "row" is offscreen, don't actually display the line but still update "state" and return the
 * proper value. This is used by mcview_wrap_move_down to advance in the file.
 *
 * @param view ...
 * @param state the parser-formatter state machine's state, updated
 * @param row print to this row
 * @param paragraph_ended store TRUE if paragraph ended by newline or EOF, FALSE if wraps to next
 *   line
 * @param linewidth store the width of the line here
 * @return the number of rows, that is, 0 if we were already at EOF, otherwise 1
 */
static int
mcview_display_line (WView * view, mcview_state_machine_t * state, int row,
                     gboolean * paragraph_ended, off_t * linewidth)
{
    const WRect *r = &view->data_area;
    off_t dpy_text_column = view->mode_flags.wrap ? 0 : view->dpy_text_column;
    int col = 0;
    int cs[1 + MAX_COMBINING_CHARS];
    char str[(1 + MAX_COMBINING_CHARS) * UTF8_CHAR_LEN + 1];
    int i, j;

    if (paragraph_ended != NULL)
        *paragraph_ended = TRUE;

    if (!view->mode_flags.wrap && (row < 0 || row >= r->lines) && linewidth == NULL)
    {
        /* Optimization: Fast forward to the end of the line, rather than carefully
         * parsing and then not actually displaying it. */
        off_t eol;
        int retval;

        eol = mcview_eol (view, state->offset);
        retval = (eol > state->offset) ? 1 : 0;

        mcview_state_machine_init (state, eol);
        return retval;
    }

    while (TRUE)
    {
        int charwidth = 0;
        mcview_state_machine_t state_saved;
        int n;
        int color;

        state_saved = *state;
        n = mcview_next_combining_char_sequence (view, state, cs, 1 + MAX_COMBINING_CHARS, &color);
        if (n == 0)
        {
            if (linewidth != NULL)
                *linewidth = col;
            return (col > 0) ? 1 : 0;
        }

        if (view->search_start <= state->offset && state->offset < view->search_end)
            color = VIEW_SELECTED_COLOR;

        if (cs[0] == '\n')
        {
            /* New line: reset all formatting state for the next paragraph. */
            mcview_state_machine_init (state, state->offset);
            if (linewidth != NULL)
                *linewidth = col;
            return 1;
        }

        if (mcview_is_non_spacing_mark (view, cs[0]))
        {
            /* Lonely combining character. Probably leftover after too many combining chars. Just ignore. */
            continue;
        }

        /* Nonprintable, or lonely spacing mark */
        if ((!mcview_isprint (view, cs[0]) || mcview_ismark (view, cs[0])) && cs[0] != '\t')
            cs[0] = '.';

        for (i = 0; i < n; i++)
            charwidth += mcview_wcwidth (view, cs[i]);

        /* Adjust the width for TAB. It's handled below along with the normal characters,
         * so that it's wrapped consistently with them, and is painted with the proper
         * attributes (although currently it can't have a special color). */
        if (cs[0] == '\t')
        {
            charwidth = option_tab_spacing - state->unwrapped_column % option_tab_spacing;
            state->print_lonely_combining = TRUE;
        }
        else
            state->print_lonely_combining = FALSE;

        /* In wrap mode only: We're done with this row if the character sequence wouldn't fit.
         * Except if at the first column, because then it wouldn't fit in the next row either.
         * In this extreme case let the unwrapped code below do its best to display it. */
        if (view->mode_flags.wrap && (off_t) col + charwidth > dpy_text_column + (off_t) r->cols
            && col > 0)
        {
            *state = state_saved;
            if (paragraph_ended != NULL)
                *paragraph_ended = FALSE;
            if (linewidth != NULL)
                *linewidth = col;
            return 1;
        }

        /* Display, unless outside of the viewport. */
        if (row >= 0 && row < r->lines)
        {
            if ((off_t) col >= dpy_text_column &&
                (off_t) col + charwidth <= dpy_text_column + (off_t) r->cols)
            {
                /* The combining character sequence fits entirely in the viewport. Print it. */
                tty_setcolor (color);
                widget_gotoyx (view, r->y + row, r->x + ((off_t) col - dpy_text_column));
                if (cs[0] == '\t')
                {
                    for (i = 0; i < charwidth; i++)
                        tty_print_char (' ');
                }
                else
                {
                    j = 0;
                    for (i = 0; i < n; i++)
                        j += mcview_char_display (view, cs[i], str + j);
                    str[j] = '\0';
                    /* This is probably a bug in our tty layer, but tty_print_string
                     * normalizes the string, whereas tty_printf doesn't. Don't normalize,
                     * since we handle combining characters ourselves correctly, it's
                     * better if they are copy-pasted correctly. Ticket 3255. */
                    tty_printf ("%s", str);
                }
            }
            else if ((off_t) col < dpy_text_column && (off_t) col + charwidth > dpy_text_column)
            {
                /* The combining character sequence would cross the left edge of the viewport.
                 * This cannot happen with wrap mode. Print replacement character(s),
                 * or spaces with the correct attributes for partial Tabs. */
                tty_setcolor (color);
                for (i = dpy_text_column;
                     i < (off_t) col + charwidth && i < dpy_text_column + (off_t) r->cols; i++)
                {
                    widget_gotoyx (view, r->y + row, r->x + (i - dpy_text_column));
                    tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_LEFT_MARGIN);
                }
            }
            else if ((off_t) col < dpy_text_column + (off_t) r->cols &&
                     (off_t) col + charwidth > dpy_text_column + (off_t) r->cols)
            {
                /* The combining character sequence would cross the right edge of the viewport
                 * and we're not wrapping. Print replacement character(s),
                 * or spaces with the correct attributes for partial Tabs. */
                tty_setcolor (color);
                for (i = col; i < dpy_text_column + (off_t) r->cols; i++)
                {
                    widget_gotoyx (view, r->y + row, r->x + (i - dpy_text_column));
                    tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_RIGHT_MARGIN);
                }
            }
        }

        col += charwidth;
        state->unwrapped_column += charwidth;

        if (!view->mode_flags.wrap && (off_t) col >= dpy_text_column + (off_t) r->cols
            && linewidth == NULL)
        {
            /* Optimization: Fast forward to the end of the line, rather than carefully
             * parsing and then not actually displaying it. */
            off_t eol;

            eol = mcview_eol (view, state->offset);
            mcview_state_machine_init (state, eol);
            return 1;
        }
    }
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Parse, format and possibly display one paragraph (perhaps not from the beginning).
 *
 * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 * default state, the additional horizontal scrolling is added here. In wrap mode, this may point
 * to the beginning of the line within a paragraph (to display the partial paragraph at the top),
 * with the proper state at that point.
 *
 * Displaying the next paragraph should start at "state"'s new value, or if we displayed the bottom
 * line then state->offset tells the file offset to be shown in the top bar.
 *
 * If "row" is negative, don't display the first abs(row) lines and display the rest from the top.
 * This was a nice idea but it's now unused :)
 *
 * If "row" is too large, don't display the paragraph at all but still return the number of lines.
 * This is used when moving upwards.
 *
 * @param view ...
 * @param state the parser-formatter state machine's state, updated
 * @param row print starting at this row
 * @return the number of rows the paragraphs is wrapped to, that is, 0 if we were already at EOF,
 *   otherwise 1 in unwrap mode, >= 1 in wrap mode. We stop when reaching the bottom of the
 *   viewport, it's not counted how many more lines the paragraph would occupy
 */
static int
mcview_display_paragraph (WView * view, mcview_state_machine_t * state, int row)
{
    int lines = 0;

    while (TRUE)
    {
        gboolean paragraph_ended;

        lines += mcview_display_line (view, state, row, &paragraph_ended, NULL);
        if (paragraph_ended)
            return lines;

        if (row < view->data_area.lines)
        {
            row++;
            /* stop if bottom of screen reached */
            if (row >= view->data_area.lines)
                return lines;
        }
    }
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Recompute dpy_state_top from dpy_start and dpy_paragraph_skip_lines. Clamp
 * dpy_paragraph_skip_lines if necessary.
 *
 * This method should be called in wrap mode after changing one of the parsing or formatting
 * properties (e.g. window width, encoding, nroff), or when switching to wrap mode from unwrap or
 * hex.
 *
 * If we stayed within the same paragraph then try to keep the vertical offset within that
 * paragraph as well. It might happen though that the paragraph became shorter than our desired
 * vertical position, in that case move to its last row.
 */
static void
mcview_wrap_fixup (WView * view)
{
    int lines = view->dpy_paragraph_skip_lines;

    if (!view->dpy_wrap_dirty)
        return;
    view->dpy_wrap_dirty = FALSE;

    view->dpy_paragraph_skip_lines = 0;
    mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);

    while (lines-- != 0)
    {
        mcview_state_machine_t state_prev;
        gboolean paragraph_ended;

        state_prev = view->dpy_state_top;
        if (mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL) == 0)
            break;
        if (paragraph_ended)
        {
            view->dpy_state_top = state_prev;
            break;
        }
        view->dpy_paragraph_skip_lines++;
    }
}

/* --------------------------------------------------------------------------------------------- */
/*** public functions ****************************************************************************/
/* --------------------------------------------------------------------------------------------- */

/**
 * In both wrap and unwrap modes, dpy_start points to the beginning of the paragraph.
 *
 * In unwrap mode, start displaying from this position, probably applying an additional horizontal
 * scroll.
 *
 * In wrap mode, an additional dpy_paragraph_skip_lines lines are skipped from the top of this
 * paragraph. dpy_state_top contains the position and parser-formatter state corresponding to the
 * top left corner so we can just start rendering from here. Unless dpy_wrap_dirty is set in which
 * case dpy_state_top is invalid and we need to recompute first.
 */
void
mcview_display_text (WView * view)
{
    const WRect *r = &view->data_area;
    int row;
    mcview_state_machine_t state;
    gboolean again;

    do
    {
        int n;

        again = FALSE;

        mcview_display_clean (view);
        mcview_display_ruler (view);

        if (!view->mode_flags.wrap)
            mcview_state_machine_init (&state, view->dpy_start);
        else
        {
            mcview_wrap_fixup (view);
            state = view->dpy_state_top;
        }

        for (row = 0; row < r->lines; row += n)
        {
            n = mcview_display_paragraph (view, &state, row);
            if (n == 0)
            {
                /* In the rare case that displaying didn't start at the beginning
                 * of the file, yet there are some empty lines at the bottom,
                 * scroll the file and display again. This happens when e.g. the
                 * window is made bigger, or the file becomes shorter due to
                 * charset change or enabling nroff. */
                if ((view->mode_flags.wrap ? view->dpy_state_top.offset : view->dpy_start) > 0)
                {
                    mcview_ascii_move_up (view, r->lines - row);
                    again = TRUE;
                }
                break;
            }
        }
    }
    while (again);

    view->dpy_end = state.offset;
    view->dpy_state_bottom = state;

    tty_setcolor (VIEW_NORMAL_COLOR);
    if (mcview_show_eof != NULL && mcview_show_eof[0] != '\0')
        while (row < r->lines)
        {
            widget_gotoyx (view, r->y + row, r->x);
            /* TODO: should make it no wider than the viewport */
            tty_print_string (mcview_show_eof);
            row++;
        }
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Move down.
 *
 * It's very simple. Just invisibly format the next "lines" lines, carefully carrying the formatter
 * state in wrap mode. But before each step we need to check if we've already hit the end of the
 * file, in that case we can no longer move. This is done by walking from dpy_state_bottom.
 *
 * Note that this relies on mcview_display_text() setting dpy_state_bottom to its correct value
 * upon rendering the screen contents. So don't call this function from other functions (e.g. at
 * the bottom of mcview_ascii_move_up()) which invalidate this value.
 */
void
mcview_ascii_move_down (WView * view, off_t lines)
{
    while (lines-- != 0)
    {
        gboolean paragraph_ended;

        /* See if there's still data below the bottom line, by imaginarily displaying one
         * more line. This takes care of reading more data into growbuf, if required.
         * If the end position didn't advance, we're at EOF and hence bail out. */
        if (mcview_display_line (view, &view->dpy_state_bottom, -1, &paragraph_ended, NULL) == 0)
            break;

        /* Okay, there's enough data. Move by 1 row at the top, too. No need to check for
         * EOF, that can't happen. */
        if (!view->mode_flags.wrap)
        {
            view->dpy_start = mcview_eol (view, view->dpy_start);
            view->dpy_paragraph_skip_lines = 0;
            view->dpy_wrap_dirty = TRUE;
        }
        else
        {
            mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL);
            if (!paragraph_ended)
                view->dpy_paragraph_skip_lines++;
            else
            {
                view->dpy_start = view->dpy_state_top.offset;
                view->dpy_paragraph_skip_lines = 0;
            }
        }
    }
}

/* --------------------------------------------------------------------------------------------- */
/**
 * Move up.
 *
 * Unwrap mode: Piece of cake. Wrap mode: If we'd walk back more than the current line offset
 * within the paragraph, we need to jump back to the previous paragraph and compute its height to
 * see if we start from that paragraph, and repeat this if necessary. Once we're within the desired
 * paragraph, we still need to format it from its beginning to know the state.
 *
 * See the top of this file for comments about MAX_BACKWARDS_WALK_IN_PARAGRAPH.
 *
 * force_max is a nice protection against the rare extreme case that the file underneath us
 * changes, we don't want to endlessly consume a file of maybe full of zeros upon moving upwards.
 */
void
mcview_ascii_move_up (WView * view, off_t lines)
{
    if (!view->mode_flags.wrap)
    {
        while (lines-- != 0)
            view->dpy_start = mcview_bol (view, view->dpy_start - 1, 0);
        view->dpy_paragraph_skip_lines = 0;
        view->dpy_wrap_dirty = TRUE;
    }
    else
    {
        int i;

        while (lines > view->dpy_paragraph_skip_lines)
        {
            /* We need to go back to the previous paragraph. */
            if (view->dpy_start == 0)
            {
                /* Oops, we're already in the first paragraph. */
                view->dpy_paragraph_skip_lines = 0;
                mcview_state_machine_init (&view->dpy_state_top, 0);
                return;
            }
            lines -= view->dpy_paragraph_skip_lines;
            view->force_max = view->dpy_start;
            view->dpy_start =
                mcview_bol (view, view->dpy_start - 1,
                            view->dpy_start - MAX_BACKWARDS_WALK_IN_PARAGRAPH);
            mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
            /* This is a tricky way of denoting that we're at the end of the paragraph.
             * Normally we'd jump to the next paragraph and reset paragraph_skip_lines. But for
             * walking backwards this is exactly what we need. */
            view->dpy_paragraph_skip_lines =
                mcview_display_paragraph (view, &view->dpy_state_top, view->data_area.lines);
            view->force_max = -1;
        }

        /* Okay, we have have dpy_start pointing to the desired paragraph, and we still need to
         * walk back "lines" lines from the current "dpy_paragraph_skip_lines" offset. We can't do
         * that, so walk from the beginning of the paragraph. */
        mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
        view->dpy_paragraph_skip_lines -= lines;
        for (i = 0; i < view->dpy_paragraph_skip_lines; i++)
            mcview_display_line (view, &view->dpy_state_top, -1, NULL, NULL);
    }
}

/* --------------------------------------------------------------------------------------------- */

void
mcview_ascii_moveto_bol (WView * view)
{
    if (!view->mode_flags.wrap)
        view->dpy_text_column = 0;
}

/* --------------------------------------------------------------------------------------------- */

void
mcview_ascii_moveto_eol (WView * view)
{
    if (!view->mode_flags.wrap)
    {
        mcview_state_machine_t state;
        off_t linewidth;

        /* Get the width of the topmost paragraph. */
        mcview_state_machine_init (&state, view->dpy_start);
        mcview_display_line (view, &state, -1, NULL, &linewidth);
        view->dpy_text_column = DOZ (linewidth, (off_t) view->data_area.cols);
    }
}

/* --------------------------------------------------------------------------------------------- */

void
mcview_state_machine_init (mcview_state_machine_t * state, off_t offset)
{
    memset (state, 0, sizeof (*state));
    state->offset = offset;
    state->print_lonely_combining = TRUE;
}

/* --------------------------------------------------------------------------------------------- */