{"id":807310,"name":"KeemenaPreprocessing.jl: Unicode-Robust Cleaning, Multi-Level Tokenisation \u0026amp; Streaming Offset Bundling for Julia NLP","description":"KeemenaPreprocessing.jl: Unicode-Robust Cleaning, Multi-Level Tokenisation \u0026amp; Streaming Offset Bundling for Julia NLP - Published in JOSS (2026)","url":"https://github.com/mantzaris/keemenapreprocessing.jl","last_synced_at":"2026-02-23T23:00:45.641Z","repository":{"id":299938682,"uuid":"1004672029","full_name":"mantzaris/KeemenaPreprocessing.jl","owner":"mantzaris","description":"Preprocessing for text data: cleaning, normalization, vectorization, tokenization and more","archived":false,"fork":false,"pushed_at":"2026-01-31T01:56:44.000Z","size":1016,"stargazers_count":3,"open_issues_count":1,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2026-01-31T16:41:23.862Z","etag":null,"topics":["julia","natural-language-processing","nlp","text-encoding","textprocessing","tokenization"],"latest_commit_sha":null,"homepage":"https://mantzaris.github.io/KeemenaPreprocessing.jl/dev/","language":"Julia","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/mantzaris.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null}},"created_at":"2025-06-19T02:16:01.000Z","updated_at":"2026-01-31T01:33:58.000Z","dependencies_parsed_at":"2025-07-03T23:24:18.389Z","dependency_job_id":"6e12e7b8-c518-4a76-8f33-b28fc9555c62","html_url":"https://github.com/mantzaris/KeemenaPreprocessing.jl","commit_stats":null,"previous_names":["mantzaris/keemenapreprocessing.jl"],"tags_count":2,"template":false,"template_full_name":null,"purl":"pkg:github/mantzaris/KeemenaPreprocessing.jl","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris","download_url":"https://codeload.github.com/mantzaris/KeemenaPreprocessing.jl/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":29760011,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-02-23T21:02:23.375Z","status":"ssl_error","status_checked_at":"2026-02-23T20:58:31.539Z","response_time":90,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.6:443 state=error: unexpected eof while reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"owner":{"login":"mantzaris","name":"a.v.mantzaris","uuid":"11978807","kind":"user","description":"Excited about the future of technology. Happy to participate in shaping that future through theory and practice.","email":"","website":null,"location":"USA","twitter":"avmantzaris","company":null,"icon_url":"https://avatars.githubusercontent.com/u/11978807?u=a9183039f7194fbf5eb6dfb93a69c4e1079ca725\u0026v=4","repositories_count":35,"last_synced_at":"2025-10-10T23:07:48.985Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/mantzaris","funding_links":[],"total_stars":58,"followers":45,"following":95,"created_at":"2022-11-20T11:23:27.331Z","updated_at":"2025-10-10T23:07:48.986Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris/repositories"},"packages":[{"id":12023122,"name":"KeemenaPreprocessing","ecosystem":"julia","description":"Preprocessing for text data: cleaning, normalization, vectorization, tokenization and more","homepage":"https://mantzaris.github.io/KeemenaPreprocessing.jl/dev/","licenses":"MIT","normalized_licenses":["MIT"],"repository_url":"https://github.com/mantzaris/KeemenaPreprocessing.jl","keywords_array":["julia","natural-language-processing","nlp","text-encoding","textprocessing","tokenization"],"namespace":null,"versions_count":2,"first_release_published_at":"2025-08-01T00:00:00.000Z","latest_release_published_at":"2026-01-01T00:00:00.000Z","latest_release_number":"0.1.1","last_synced_at":"2026-02-16T09:12:28.358Z","created_at":"2025-08-20T00:38:38.079Z","updated_at":"2026-02-16T09:21:34.051Z","registry_url":"https://juliahub.com/ui/Packages/General/KeemenaPreprocessing/","install_command":"Pkg.add(\"KeemenaPreprocessing\")","documentation_url":"https://docs.juliahub.com/General/KeemenaPreprocessing/stable/","metadata":{"uuid":"f4ce45ae-2b88-40b3-ad81-d2d70f7eb3a1"},"repo_metadata":{"id":299938682,"uuid":"1004672029","full_name":"mantzaris/KeemenaPreprocessing.jl","owner":"mantzaris","description":"Preprocessing for text data: cleaning, normalization, vectorization, tokenization and more","archived":false,"fork":false,"pushed_at":"2025-09-23T00:34:42.000Z","size":548,"stargazers_count":1,"open_issues_count":1,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-09-24T00:32:40.172Z","etag":null,"topics":["julia","natural-language-processing","nlp","text-encoding","textprocessing","tokenization"],"latest_commit_sha":null,"homepage":"https://mantzaris.github.io/KeemenaPreprocessing.jl/dev/","language":"Julia","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"mit","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/mantzaris.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2025-06-19T02:16:01.000Z","updated_at":"2025-08-20T11:12:49.000Z","dependencies_parsed_at":"2025-07-03T23:24:18.389Z","dependency_job_id":"6e12e7b8-c518-4a76-8f33-b28fc9555c62","html_url":"https://github.com/mantzaris/KeemenaPreprocessing.jl","commit_stats":null,"previous_names":["mantzaris/keemenapreprocessing.jl"],"tags_count":1,"template":false,"template_full_name":null,"purl":"pkg:github/mantzaris/KeemenaPreprocessing.jl","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris","download_url":"https://codeload.github.com/mantzaris/KeemenaPreprocessing.jl/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":276683835,"owners_count":25685629,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-24T02:00:09.776Z","response_time":97,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"},"owner_record":{"login":"mantzaris","name":"a.v.mantzaris","uuid":"11978807","kind":"user","description":"Excited about the future of technology. Happy to participate in shaping that future through theory and practice.","email":"","website":null,"location":"USA","twitter":"avmantzaris","company":null,"icon_url":"https://avatars.githubusercontent.com/u/11978807?u=a9183039f7194fbf5eb6dfb93a69c4e1079ca725\u0026v=4","repositories_count":34,"last_synced_at":"2025-09-24T00:26:54.349Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/mantzaris","funding_links":[],"total_stars":57,"followers":47,"following":95,"created_at":"2022-11-20T11:23:27.331Z","updated_at":"2025-09-24T00:26:54.349Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mantzaris/repositories"},"tags":[{"name":"v0.1.0","sha":"6614ccdb2d1311c295d7b4ea162210f694ad6d48","kind":"tag","published_at":"2025-08-20T00:06:22.000Z","download_url":"https://codeload.github.com/mantzaris/KeemenaPreprocessing.jl/tar.gz/v0.1.0","html_url":"https://github.com/mantzaris/KeemenaPreprocessing.jl/releases/tag/v0.1.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/mantzaris/KeemenaPreprocessing.jl@v0.1.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/tags/v0.1.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/tags/v0.1.0/manifests"}]},"repo_metadata_updated_at":"2026-02-16T09:21:34.030Z","dependent_packages_count":0,"downloads":1,"downloads_period":"total","dependent_repos_count":0,"rankings":{"downloads":null,"dependent_repos_count":8.15566299437267,"dependent_packages_count":35.111357692002855,"stargazers_count":null,"forks_count":null,"docker_downloads_count":null,"average":21.633510343187762},"purl":"pkg:julia/KeemenaPreprocessing","advisories":[],"docker_usage_url":"https://docker.ecosyste.ms/usage/julia/KeemenaPreprocessing","docker_dependents_count":null,"docker_downloads_count":null,"usage_url":"https://repos.ecosyste.ms/usage/julia/KeemenaPreprocessing","dependent_repositories_url":"https://repos.ecosyste.ms/api/v1/usage/julia/KeemenaPreprocessing/dependencies","status":null,"funding_links":[],"critical":null,"issue_metadata":{"last_synced_at":"2025-09-01T01:05:06.363Z","issues_count":0,"pull_requests_count":0,"avg_time_to_close_issue":null,"avg_time_to_close_pull_request":null,"issues_closed_count":0,"pull_requests_closed_count":0,"pull_request_authors_count":0,"issue_authors_count":0,"avg_comments_per_issue":null,"avg_comments_per_pull_request":null,"merged_pull_requests_count":0,"bot_issues_count":0,"bot_pull_requests_count":0,"past_year_issues_count":0,"past_year_pull_requests_count":0,"past_year_avg_time_to_close_issue":null,"past_year_avg_time_to_close_pull_request":null,"past_year_issues_closed_count":0,"past_year_pull_requests_closed_count":0,"past_year_pull_request_authors_count":0,"past_year_issue_authors_count":0,"past_year_avg_comments_per_issue":null,"past_year_avg_comments_per_pull_request":null,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":0,"past_year_merged_pull_requests_count":0,"issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/issues","maintainers":[],"active_maintainers":[]},"versions_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages/KeemenaPreprocessing/versions","version_numbers_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages/KeemenaPreprocessing/version_numbers","dependent_packages_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages/KeemenaPreprocessing/dependent_packages","related_packages_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages/KeemenaPreprocessing/related_packages","codemeta_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages/KeemenaPreprocessing/codemeta","maintainers":[],"registry":{"name":"juliahub.com","url":"https://juliahub.com","ecosystem":"julia","default":true,"packages_count":13347,"maintainers_count":0,"namespaces_count":0,"keywords_count":0,"github":"JuliaRegistries","metadata":{"funded_packages_count":845},"icon_url":"https://github.com/JuliaRegistries.png","created_at":"2022-04-19T16:34:08.340Z","updated_at":"2026-02-22T09:07:04.189Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/packages","maintainers_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/maintainers","namespaces_url":"https://packages.ecosyste.ms/api/v1/registries/juliahub.com/namespaces"}}],"commits":{"id":10500441,"full_name":"mantzaris/KeemenaPreprocessing.jl","default_branch":"main","total_commits":70,"total_committers":1,"total_bot_commits":0,"total_bot_committers":0,"mean_commits":70.0,"dds":0.0,"past_year_total_commits":70,"past_year_total_committers":1,"past_year_total_bot_commits":0,"past_year_total_bot_committers":0,"past_year_mean_commits":70.0,"past_year_dds":0.0,"last_synced_at":"2025-09-24T05:12:20.779Z","last_synced_commit":"f6c2c8b35b25469f5aa917937730da04ee509227","created_at":"2025-06-30T01:01:56.662Z","updated_at":"2025-09-24T05:12:17.009Z","committers":[{"name":"mantzaris","email":"avmantzaris@gmail.com","login":null,"count":70}],"past_year_committers":[{"name":"mantzaris","email":"avmantzaris@gmail.com","login":null,"count":70}],"commits_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/commits","host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2026-02-23T00:00:12.369Z","repositories_count":6181675,"commits_count":929830952,"contributors_count":36026246,"owners_count":1145377,"icon_url":"https://github.com/github.png","host_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories"}},"issues_stats":{"full_name":"mantzaris/KeemenaPreprocessing.jl","html_url":"https://github.com/mantzaris/KeemenaPreprocessing.jl","last_synced_at":"2026-02-06T19:07:46.294Z","status":null,"issues_count":1,"pull_requests_count":0,"avg_time_to_close_issue":1.0,"avg_time_to_close_pull_request":null,"issues_closed_count":1,"pull_requests_closed_count":0,"pull_request_authors_count":0,"issue_authors_count":1,"avg_comments_per_issue":2.0,"avg_comments_per_pull_request":null,"merged_pull_requests_count":0,"bot_issues_count":0,"bot_pull_requests_count":0,"past_year_issues_count":1,"past_year_pull_requests_count":0,"past_year_avg_time_to_close_issue":1.0,"past_year_avg_time_to_close_pull_request":null,"past_year_issues_closed_count":1,"past_year_pull_requests_closed_count":0,"past_year_pull_request_authors_count":0,"past_year_issue_authors_count":1,"past_year_avg_comments_per_issue":2.0,"past_year_avg_comments_per_pull_request":null,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":0,"past_year_merged_pull_requests_count":0,"created_at":"2025-06-30T01:01:56.921Z","updated_at":"2026-02-06T19:07:46.295Z","repository_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl","issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/mantzaris%2FKeemenaPreprocessing.jl/issues","issue_labels_count":{},"pull_request_labels_count":{},"issue_author_associations_count":{"NONE":1},"pull_request_author_associations_count":{},"issue_authors":{"JuliaTagBot":1},"pull_request_authors":{},"host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2026-02-23T00:00:08.287Z","repositories_count":13421844,"issues_count":35076597,"pull_requests_count":114307637,"authors_count":11169485,"icon_url":"https://github.com/github.png","host_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories","owners_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/owners","authors_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors"},"past_year_issue_labels_count":{},"past_year_pull_request_labels_count":{},"past_year_issue_author_associations_count":{"NONE":1},"past_year_pull_request_author_associations_count":{},"past_year_issue_authors":{"JuliaTagBot":1},"past_year_pull_request_authors":{},"maintainers":[],"active_maintainers":[]},"events":{"total":{"CreateEvent":111,"CommitCommentEvent":3,"ReleaseEvent":2,"DeleteEvent":1,"PullRequestEvent":2,"ForkEvent":1,"IssuesEvent":3,"WatchEvent":2,"IssueCommentEvent":2,"PushEvent":83},"last_year":{"CreateEvent":111,"CommitCommentEvent":3,"ReleaseEvent":2,"DeleteEvent":1,"PullRequestEvent":2,"ForkEvent":1,"IssuesEvent":3,"WatchEvent":2,"IssueCommentEvent":2,"PushEvent":83}},"keywords":["julia","natural-language-processing","nlp","text-encoding","textprocessing","tokenization"],"dependencies":[{"ecosystem":"actions","filepath":".github/workflows/CI.yml","sha":null,"kind":"manifest","created_at":"2025-06-19T03:44:17.892Z","updated_at":"2025-06-19T03:44:17.892Z","repository_link":"https://github.com/mantzaris/KeemenaPreprocessing.jl/blob/main/.github/workflows/CI.yml","dependencies":[{"id":23695814106,"package_name":"actions/checkout","ecosystem":"actions","requirements":"v4","direct":true,"kind":"composite","optional":false},{"id":23695814107,"package_name":"julia-actions/setup-julia","ecosystem":"actions","requirements":"v2","direct":true,"kind":"composite","optional":false},{"id":23695814108,"package_name":"julia-actions/cache","ecosystem":"actions","requirements":"v2","direct":true,"kind":"composite","optional":false},{"id":23695814109,"package_name":"julia-actions/julia-buildpkg","ecosystem":"actions","requirements":"v1","direct":true,"kind":"composite","optional":false},{"id":23695814110,"package_name":"julia-actions/julia-runtest","ecosystem":"actions","requirements":"v1","direct":true,"kind":"composite","optional":false},{"id":23695814111,"package_name":"julia-actions/julia-docdeploy","ecosystem":"actions","requirements":"v1","direct":true,"kind":"composite","optional":false}]},{"ecosystem":"actions","filepath":".github/workflows/CompatHelper.yml","sha":null,"kind":"manifest","created_at":"2025-06-19T03:44:17.944Z","updated_at":"2025-06-19T03:44:17.944Z","repository_link":"https://github.com/mantzaris/KeemenaPreprocessing.jl/blob/main/.github/workflows/CompatHelper.yml","dependencies":[]},{"ecosystem":"actions","filepath":".github/workflows/TagBot.yml","sha":null,"kind":"manifest","created_at":"2025-06-19T03:44:17.985Z","updated_at":"2025-06-19T03:44:17.985Z","repository_link":"https://github.com/mantzaris/KeemenaPreprocessing.jl/blob/main/.github/workflows/TagBot.yml","dependencies":[{"id":23695814284,"package_name":"JuliaRegistries/TagBot","ecosystem":"actions","requirements":"v1","direct":true,"kind":"composite","optional":false}]}],"score":1.3862943611198906,"created_at":"2026-02-23T23:00:06.773Z","updated_at":"2026-02-24T00:30:07.273Z","avatar_url":"https://github.com/mantzaris.png","language":"Julia","category":null,"sub_category":null,"monthly_downloads":0,"funding_links":[],"readme_doi_urls":[],"works":{},"citation_counts":{},"total_citations":0,"keywords_from_contributors":[],"project_url":"https://science.ecosyste.ms/api/v1/projects/807310","html_url":"https://science.ecosyste.ms/projects/807310"}