Skip to content

Commit

Permalink
feat: count string by codepoint (#44)
Browse files Browse the repository at this point in the history
* feat: count string by codepoint

The Unicode FAQ describes four ways to measure the length of a string:
https://unicode.org/faq/char_combmark.html#7

This commit adds support for counting by code points.

* refactor: cut-out to strLenByCodeUnits function

ref:
- #44 (comment)

Co-authored-by: azu <[email protected]>

* chore: s/code units/codeunits/

ref:
- #44 (comment)

Co-authored-by: azu <[email protected]>

* chore: s/strLenByCodePoint/strLenByCodePoints/

---------

Co-authored-by: azu <[email protected]>
  • Loading branch information
yumetodo and azu committed Aug 14, 2024
1 parent 417f1f7 commit a6873ea
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
33 changes: 30 additions & 3 deletions src/sentence-length.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ export type Options = {
* @deprecated use skipPatterns
*/
exclusionPatterns?: string[];
/**
* Determines how string length is counted.
* When omitted or set to "codeunits", length is counted in UTF-16 code units (i.e. `String.prototype.length`).
* When set to "codepoints", length is counted in Unicode code points.
*/
countBy?: "codeunits" | "codepoints";
};
const defaultOptions: Required<Options> = {
max: 100,
Expand All @@ -45,16 +51,37 @@ const defaultOptions: Required<Options> = {
/**
* @deprecated
*/
exclusionPatterns: []
exclusionPatterns: [],
countBy: "codeunits"
};

/**
 * Type guard for sentence nodes.
 *
 * @param node node to inspect
 * @returns `true` (narrowing `node` to `TxtSentenceNode`) when the node's `type`
 * equals `SentenceSplitterSyntax.Sentence`
 */
const isSentenceNode = (node: TxtParentNodeWithSentenceNodeContent): node is TxtSentenceNode =>
    node.type === SentenceSplitterSyntax.Sentence;

/**
 * Measure a string in UTF-16 code units.
 *
 * This is exactly the value of the string's `length` property, so a character
 * stored as a surrogate pair contributes 2.
 * @param s string to measure
 * @returns number of UTF-16 code units in `s`
 */
const strLenByCodeUnits = (s: string): number => {
    return s.length;
};
/**
 * Measure a string in Unicode code points.
 *
 * A well-formed surrogate pair counts as one; an unpaired surrogate also counts as one.
 *
 * Complexity: O(n) — every code unit is visited at most once.
 * @param s string to measure
 * @returns number of code points in `s`
 */
const strLenByCodePoints = (s: string): number => {
    let count = 0;
    let index = 0;
    while (index < s.length) {
        const codePoint = s.codePointAt(index);
        // A code point above U+FFFF occupies two UTF-16 code units (a surrogate pair),
        // so step over both; anything else (including a lone surrogate) occupies one.
        index += codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
        count += 1;
    }
    return count;
};
const reporter: TextlintRuleReporter<Options> = (context, options = {}) => {
const maxLength = options.max ?? defaultOptions.max;
const skipPatterns = options.skipPatterns ?? options.exclusionPatterns ?? defaultOptions.skipPatterns;
const skipUrlStringLink = options.skipUrlStringLink ?? defaultOptions.skipUrlStringLink;
const strLen = options.countBy == null || options.countBy === "codeunits" ? strLenByCodeUnits : strLenByCodePoints;
const helper = new RuleHelper(context);
const { Syntax, RuleError, report } = context;
const isUrlStringLink = (node: TxtSentenceNodeChildren): boolean => {
Expand Down Expand Up @@ -96,8 +123,8 @@ const reporter: TextlintRuleReporter<Options> = (context, options = {}) => {
const actualText = source.toString();
const sentenceText = removeRangeFromString(actualText, skipPatterns);
// larger than > 100
const actualTextLength = actualText.length;
const sentenceLength = sentenceText.length;
const actualTextLength = strLen(actualText);
const sentenceLength = strLen(sentenceText);
if (sentenceLength > maxLength) {
const startLine = filteredSentence.loc.start.line;
report(
Expand Down
19 changes: 19 additions & 0 deletions test/sentence-length-test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ Shopify Functionで利用されるが、非同期処理の制限や5ms未満で
max: 10,
skipPatterns: ['/".*"/']
}
},
{
text: "𦥑井と臼井",
options: {
max: 5,
countBy: "codepoints"
}
}
],
invalid: [
Expand Down Expand Up @@ -249,6 +256,18 @@ Over 18 characters.`
max: 5,
skipUrlStringLink: false
}
},
{
text: "𦥑井と臼井",
errors: [
{
message: `Line 1 sentence length(6) exceeds the maximum sentence length of 5.
Over 1 characters.`
}
],
options: {
max: 5
}
}
]
});

0 comments on commit a6873ea

Please sign in to comment.