From 6d4e02fe5f2e79fceb6cf672f6f822714db6d0fe Mon Sep 17 00:00:00 2001
From: Gergely Nagy
Date: Thu, 11 Jan 2024 00:20:32 +0100
Subject: [PATCH] Improved Linguist compatibility

Recognise the `linguist-documentation` and `linguist-detectable`
attributes in `.gitattributes` files, and use them in
`GetLanguageStats()` to make a decision whether to include a particular
file in the stats or not.

This allows one more control over which files in their repositories
contribute toward the language statistics, so that for a project that is
mostly documentation, the language stats can reflect that.

An explicit `linguist-generated` attribute keeps being honoured in both
directions: files marked as generated are skipped, and enry's heuristics
are only consulted when the attribute was not explicitly disabled.

Fixes #1672.

Signed-off-by: Gergely Nagy
---
 modules/git/repo_attribute.go              |   2 +-
 modules/git/repo_language_stats.go         |  19 +
 modules/git/repo_language_stats_gogit.go   |  42 +--
 modules/git/repo_language_stats_nogogit.go |  41 ++-
 tests/integration/repo_lang_stats_test.go  | 293 +++++++++++++++++++++
 5 files changed, 369 insertions(+), 28 deletions(-)
 create mode 100644 tests/integration/repo_lang_stats_test.go

diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go
index 2b34f117f7..3c5a1429a9 100644
--- a/modules/git/repo_attribute.go
+++ b/modules/git/repo_attribute.go
@@ -291,7 +291,7 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe
 	}
 
 	checker := &CheckAttributeReader{
-		Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language"},
+		Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language", "linguist-documentation", "linguist-detectable"},
 		Repo:       repo,
 		IndexFile:  indexFilename,
 		WorkTree:   worktree,
diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go
index c40d6937b5..7ed2dc1587 100644
--- a/modules/git/repo_language_stats.go
+++ b/modules/git/repo_language_stats.go
@@ -13,6 +13,25 @@ const (
 	bigFileSize   int64 = 1024 * 1024 // 1 MiB
 )
 
+// LinguistBoolAttrib holds the raw value of a boolean-like linguist
+// attribute. The zero value is neither true nor false.
+type LinguistBoolAttrib struct {
+	Value string
+}
+
+// IsTrue reports whether the attribute was explicitly enabled, that is, its
+// value is "set" or "true".
+func (attrib *LinguistBoolAttrib) IsTrue() bool {
+	return attrib.Value == "set" || attrib.Value == "true"
+}
+
+// IsFalse reports whether the attribute was explicitly disabled, that is, its
+// value is "unset" or "false". Any other value, including the empty string,
+// is not considered false.
+func (attrib *LinguistBoolAttrib) IsFalse() bool {
+	return attrib.Value == "unset" || attrib.Value == "false"
+}
+
 // mergeLanguageStats mergers language names with different cases. The name with most upper case letters is used.
 func mergeLanguageStats(stats map[string]int64) map[string]int64 {
 	names := map[string]struct {
diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go
index 4c6fbd6c7e..558a83af74 100644
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@@ -1,4 +1,5 @@
 // Copyright 2020 The Gitea Authors. All rights reserved.
+// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
 // SPDX-License-Identifier: MIT
 
 //go:build gogit
@@ -57,23 +58,25 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			return nil
 		}
 
-		notVendored := false
-		notGenerated := false
+		isVendored := LinguistBoolAttrib{}
+		isGenerated := LinguistBoolAttrib{}
+		isDocumentation := LinguistBoolAttrib{}
+		isDetectable := LinguistBoolAttrib{}
 
 		if checker != nil {
 			attrs, err := checker.CheckPath(f.Name)
 			if err == nil {
 				if vendored, has := attrs["linguist-vendored"]; has {
-					if vendored == "set" || vendored == "true" {
-						return nil
-					}
-					notVendored = vendored == "false"
+					isVendored = LinguistBoolAttrib{Value: vendored}
 				}
 				if generated, has := attrs["linguist-generated"]; has {
-					if generated == "set" || generated == "true" {
-						return nil
-					}
-					notGenerated = generated == "false"
+					isGenerated = LinguistBoolAttrib{Value: generated}
+				}
+				if documentation, has := attrs["linguist-documentation"]; has {
+					isDocumentation = LinguistBoolAttrib{Value: documentation}
+				}
+				if detectable, has := attrs["linguist-detectable"]; has {
+					isDetectable = LinguistBoolAttrib{Value: detectable}
 				}
 				if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
 					// group languages, such as Pug -> HTML; SCSS -> CSS
@@ -105,8 +108,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			}
 		}
 
-		if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) ||
-			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
+		if isDetectable.IsFalse() || isVendored.IsTrue() || isGenerated.IsTrue() || isDocumentation.IsTrue() ||
+			(!isVendored.IsFalse() && analyze.IsVendor(f.Name)) ||
+			enry.IsDotFile(f.Name) ||
+			enry.IsConfiguration(f.Name) ||
+			(!isDocumentation.IsFalse() && enry.IsDocumentation(f.Name)) {
 			return nil
 		}
 
@@ -115,12 +121,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		if f.Size <= bigFileSize {
 			content, _ = readFile(f, fileSizeLimit)
 		}
-		if !notGenerated && enry.IsGenerated(f.Name, content) {
+		// An explicitly disabled linguist-generated attribute overrides
+		// enry's generated-file heuristics.
+		if !isGenerated.IsFalse() && enry.IsGenerated(f.Name, content) {
 			return nil
 		}
 
 		// TODO: Use .gitattributes file for linguist overrides
-
 		language := analyze.GetCodeLanguage(f.Name, content)
 		if language == enry.OtherLanguage || language == "" {
 			return nil
@@ -136,6 +143,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		if !checked {
 			langtype := enry.GetLanguageType(language)
 			included = langtype == enry.Programming || langtype == enry.Markup
+			if !included {
+				if isDetectable.IsTrue() {
+					included = true
+				} else {
+					return nil
+				}
+			}
 			includedLanguage[language] = included
 		}
 		if included {
diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go
index 1d94ad6c00..13876094cc 100644
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@@ -1,4 +1,5 @@
 // Copyright 2020 The Gitea Authors. All rights reserved.
+// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
 // SPDX-License-Identifier: MIT
 
 //go:build !gogit
@@ -90,23 +91,25 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			continue
 		}
 
-		notVendored := false
-		notGenerated := false
+		isVendored := LinguistBoolAttrib{}
+		isGenerated := LinguistBoolAttrib{}
+		isDocumentation := LinguistBoolAttrib{}
+		isDetectable := LinguistBoolAttrib{}
 
 		if checker != nil {
 			attrs, err := checker.CheckPath(f.Name())
 			if err == nil {
 				if vendored, has := attrs["linguist-vendored"]; has {
-					if vendored == "set" || vendored == "true" {
-						continue
-					}
-					notVendored = vendored == "false"
+					isVendored = LinguistBoolAttrib{Value: vendored}
 				}
 				if generated, has := attrs["linguist-generated"]; has {
-					if generated == "set" || generated == "true" {
-						continue
-					}
-					notGenerated = generated == "false"
+					isGenerated = LinguistBoolAttrib{Value: generated}
+				}
+				if documentation, has := attrs["linguist-documentation"]; has {
+					isDocumentation = LinguistBoolAttrib{Value: documentation}
+				}
+				if detectable, has := attrs["linguist-detectable"]; has {
+					isDetectable = LinguistBoolAttrib{Value: detectable}
 				}
 				if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
 					// group languages, such as Pug -> HTML; SCSS -> CSS
@@ -139,8 +142,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 			}
 		}
 
-		if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
-			enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
+		if isDetectable.IsFalse() || isVendored.IsTrue() || isGenerated.IsTrue() || isDocumentation.IsTrue() ||
+			(!isVendored.IsFalse() && analyze.IsVendor(f.Name())) ||
+			enry.IsDotFile(f.Name()) ||
+			enry.IsConfiguration(f.Name()) ||
+			(!isDocumentation.IsFalse() && enry.IsDocumentation(f.Name())) {
 			continue
 		}
 
@@ -173,7 +179,9 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 				return nil, err
 			}
 		}
-		if !notGenerated && enry.IsGenerated(f.Name(), content) {
+		// An explicitly disabled linguist-generated attribute overrides
+		// enry's generated-file heuristics.
+		if !isGenerated.IsFalse() && enry.IsGenerated(f.Name(), content) {
 			continue
 		}
 
@@ -194,6 +202,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		if !checked {
 			langType := enry.GetLanguageType(language)
 			included = langType == enry.Programming || langType == enry.Markup
+			if !included {
+				if isDetectable.IsTrue() {
+					included = true
+				} else {
+					continue
+				}
+			}
 			includedLanguage[language] = included
 		}
 		if included {
diff --git a/tests/integration/repo_lang_stats_test.go b/tests/integration/repo_lang_stats_test.go
new file mode 100644
index 0000000000..f3a7e4bc6d
--- /dev/null
+++ b/tests/integration/repo_lang_stats_test.go
@@ -0,0 +1,293 @@
+// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package integration
+
+import (
+	"context"
+	"net/url"
+	"strings"
+	"testing"
+	"time"
+
+	"code.gitea.io/gitea/models/db"
+	repo_model "code.gitea.io/gitea/models/repo"
+	"code.gitea.io/gitea/models/unittest"
+	user_model "code.gitea.io/gitea/models/user"
+	"code.gitea.io/gitea/modules/git"
+	"code.gitea.io/gitea/modules/indexer/stats"
+	"code.gitea.io/gitea/modules/queue"
+	repo_service "code.gitea.io/gitea/services/repository"
+	files_service "code.gitea.io/gitea/services/repository/files"
+	"code.gitea.io/gitea/tests"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// createLangStatTestRepo creates an auto-initialised "lang-stat-test"
+// repository owned by the user with ID 2, and returns it along with a cleanup
+// function that deletes the repository again.
+func createLangStatTestRepo(t *testing.T) (*repo_model.Repository, func()) {
+	t.Helper()
+
+	user2 := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: 2})
+
+	// Create a new repository
+	repo, err := repo_service.CreateRepository(db.DefaultContext, user2, user2, repo_service.CreateRepoOptions{
+		Name:          "lang-stat-test",
+		Description:   "minimal repo for language stats testing",
+		AutoInit:      true,
+		Gitignores:    "Go",
+		License:       "MIT",
+		Readme:        "Default",
+		DefaultBranch: "main",
+		IsPrivate:     false,
+	})
+	assert.NoError(t, err)
+	assert.NotEmpty(t, repo)
+
+	return repo, func() {
+		repo_service.DeleteRepository(db.DefaultContext, user2, repo, false)
+	}
+}
+
+// addLangStatTestFiles commits a fixed set of sample files to the main branch
+// of repo, together with a `.gitattributes` file holding contents, and
+// returns the SHA of the resulting commit.
+func addLangStatTestFiles(t *testing.T, repo *repo_model.Repository, contents string) string {
+	t.Helper()
+
+	owner := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: repo.OwnerID})
+
+	addFilesResp, err := files_service.ChangeRepoFiles(git.DefaultContext, repo, owner, &files_service.ChangeRepoFilesOptions{
+		Files: []*files_service.ChangeRepoFile{
+			{
+				Operation:     "create",
+				TreePath:      ".gitattributes",
+				ContentReader: strings.NewReader(contents),
+			},
+			{
+				Operation:     "create",
+				TreePath:      "docs.md",
+				ContentReader: strings.NewReader("This **is** a `markdown` file.\n"),
+			},
+			{
+				Operation:     "create",
+				TreePath:      "foo.c",
+				ContentReader: strings.NewReader(`#include \nint main() {\n printf("Hello world!\n");\n return 0;\n}\n`),
+			},
+			{
+				Operation:     "create",
+				TreePath:      "foo.nib",
+				ContentReader: strings.NewReader("Pinky promise, this is not a generated file!\n"),
+			},
+			{
+				Operation:     "create",
+				TreePath:      ".dot.pas",
+				ContentReader: strings.NewReader("program Hello;\nbegin\n writeln('Hello, world.');\nend.\n"),
+			},
+			{
+				Operation:     "create",
+				TreePath:      "cpplint.py",
+				ContentReader: strings.NewReader(`#! /usr/bin/env python\n\nprint("Hello world!")\n`),
+			},
+			{
+				Operation:     "create",
+				TreePath:      "some-file.xml",
+				ContentReader: strings.NewReader(`\n\n Hello\n\n`),
+			},
+		},
+		Message:   "add files",
+		OldBranch: "main",
+		NewBranch: "main",
+		Author: &files_service.IdentityOptions{
+			Name:  owner.Name,
+			Email: owner.Email,
+		},
+		Committer: &files_service.IdentityOptions{
+			Name:  owner.Name,
+			Email: owner.Email,
+		},
+		Dates: &files_service.CommitDateOptions{
+			Author:    time.Now(),
+			Committer: time.Now(),
+		},
+	})
+	assert.NoError(t, err)
+	assert.NotEmpty(t, addFilesResp)
+
+	return addFilesResp.Commit.SHA
+}
+
+func TestRepoLangStats(t *testing.T) {
+	onGiteaRun(t, func(t *testing.T, u *url.URL) {
+		/******************
+		 ** Preparations **
+		 ******************/
+		prep := func(t *testing.T, attribs string) (*repo_model.Repository, string, func()) {
+			t.Helper()
+
+			repo, f := createLangStatTestRepo(t)
+			sha := addLangStatTestFiles(t, repo, attribs)
+
+			return repo, sha, f
+		}
+
+		getFreshLanguageStats := func(t *testing.T, repo *repo_model.Repository, sha string) repo_model.LanguageStatList {
+			t.Helper()
+
+			err := stats.UpdateRepoIndexer(repo)
+			assert.NoError(t, err)
+
+			assert.NoError(t, queue.GetManager().FlushAll(context.Background(), 10*time.Second))
+
+			status, err := repo_model.GetIndexerStatus(db.DefaultContext, repo, repo_model.RepoIndexerTypeStats)
+			assert.NoError(t, err)
+			assert.Equal(t, sha, status.CommitSha)
+			langs, err := repo_model.GetTopLanguageStats(db.DefaultContext, repo, 5)
+			assert.NoError(t, err)
+
+			return langs
+		}
+
+		/***********
+		 ** Tests **
+		 ***********/
+
+		// 1. By default, documentation is not indexed
+		t.Run("default", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+
+			// While this is a fairly short test, this exercises a number of
+			// things:
+			//
+			// - `.gitattributes` is empty, so `isDetectable.IsFalse()`,
+			//   `isVendored.IsTrue()`, `isGenerated.IsTrue()`, and
+			//   `isDocumentation.IsTrue()` will be false for every file, because
+			//   these are only true if an attribute is explicitly set.
+			//
+			// - There is `.dot.pas`, which would be considered Pascal source,
+			//   but it is a dotfile (thus, `enry.IsDotFile()` applies), and as
+			//   such, is not considered.
+			//
+			// - `some-file.xml` will be skipped because Enry considers XML
+			//   configuration, and `enry.IsConfiguration()` will catch it.
+			//
+			// - `!isVendored.IsFalse()` evaluates to true, so
+			//   `analyze.isVendor()` will be called on `cpplint.py`, which will
+			//   be considered vendored, even though both the filename and
+			//   contents would otherwise make it Python.
+			//
+			// - `!isDocumentation.IsFalse()` evaluates to true, so
+			//   `enry.IsDocumentation()` will be called for `docs.md`, and will
+			//   be considered documentation, thus, skipped.
+			//
+			// Thus, this exercises all of the conditions in the first big if
+			// that is supposed to filter out files early. With two short asserts!
+
+			assert.Len(t, langs, 1)
+			assert.Equal(t, "C", langs[0].Language)
+		})
+
+		// 2. Marking foo.c as non-detectable
+		t.Run("foo.c non-detectable", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "foo.c linguist-detectable=false\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Empty(t, langs)
+		})
+
+		// 3. Marking Markdown detectable
+		t.Run("detectable markdown", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "*.md linguist-detectable\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Len(t, langs, 2)
+			assert.Equal(t, "C", langs[0].Language)
+			assert.Equal(t, "Markdown", langs[1].Language)
+		})
+
+		// 4. Marking foo.c as documentation
+		t.Run("foo.c as documentation", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "foo.c linguist-documentation\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Empty(t, langs)
+		})
+
+		// 5. Overriding a generated file
+		t.Run("linguist-generated=false", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "foo.nib linguist-generated=false\nfoo.nib linguist-language=Perl\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Len(t, langs, 2)
+			assert.Equal(t, "C", langs[0].Language)
+			assert.Equal(t, "Perl", langs[1].Language)
+		})
+
+		// 6. Disabling vendoring for a file
+		t.Run("linguist-vendored=false", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "cpplint.py linguist-vendored=false\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Len(t, langs, 2)
+			assert.Equal(t, "C", langs[0].Language)
+			assert.Equal(t, "Python", langs[1].Language)
+		})
+
+		// 7. Disabling vendoring for a file, with -linguist-vendored
+		t.Run("-linguist-vendored", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "cpplint.py -linguist-vendored\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Len(t, langs, 2)
+			assert.Equal(t, "C", langs[0].Language)
+			assert.Equal(t, "Python", langs[1].Language)
+		})
+
+		// 8. Marking foo.c as vendored
+		t.Run("foo.c as vendored", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "foo.c linguist-vendored\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Empty(t, langs)
+		})
+
+		// 9. Marking foo.c as generated
+		t.Run("foo.c as generated", func(t *testing.T) {
+			defer tests.PrintCurrentTest(t)()
+
+			repo, sha, f := prep(t, "foo.c linguist-generated\n")
+			defer f()
+
+			langs := getFreshLanguageStats(t, repo, sha)
+			assert.Empty(t, langs)
+		})
+	})
+}