apache · rmuir · Mar 2, 2025 · Mar 2, 2025
diff --git a/dev-tools/scripts/addBackcompatIndexes.py b/dev-tools/scripts/addBackcompatIndexes.py
@@ -17,7 +17,7 @@
 
 
 # For usage information, see:
-# 
+#
 #   http://wiki.apache.org/lucene-java/ReleaseTodo#Generate_Backcompat_Indexes
 
 
@@ -49,7 +49,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
     filename = '%s.%s-%s.zip' % (prefix, index_version, indextype)
   else:
     filename = '%s.%s.zip' % (prefix, index_version)
-  
+
   print('  creating %s...' % filename, end='', flush=True)
   module = 'backward-codecs'
   index_dir = os.path.join('lucene', module, 'src/test/org/apache/lucene/backward_index')
@@ -76,7 +76,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
   ])
   base_dir = os.getcwd()
   bc_index_file = os.path.join(temp_dir, filename)
-  
+
   if os.path.exists(bc_index_file):
     print('alreadyexists')
   else:
@@ -85,7 +85,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
     if not os.path.exists(bc_index_file):
       raise Exception("Expected file can't be found: %s" %bc_index_file)
     print('done')
-  
+
   print('  adding %s...' % filename, end='', flush=True)
   scriptutil.run('cp %s %s' % (bc_index_file, os.path.join(base_dir, index_dir)))
   os.chdir(base_dir)
@@ -125,7 +125,7 @@ def append(buffer, changed):
       buffer.append('\n')
     buffer.append(('%s\n') % index_version)
     return True
-        
+
   changed = scriptutil.update_file(filename, re.compile(r'.*'), edit, append)
   print('done' if changed else 'uptodate')
 
@@ -139,7 +139,7 @@ def download_from_cdn(version, remotename, localname):
   try:
     urllib.request.urlretrieve(url, localname)
     return True
-  except urllib.error.URLError as e:
+  except urllib.error.HTTPError as e:
     if e.code == 404:
       return False
     raise e
@@ -149,14 +149,14 @@ def download_from_archives(version, remotename, localname):
   try:
     urllib.request.urlretrieve(url, localname)
     return True
-  except urllib.error.URLError as e:
+  except urllib.error.HTTPError as e:
     if e.code == 404:
       return False
     raise e
 
 def download_release(version, temp_dir, force):
   print('  downloading %s source release...' % version, end='', flush=True)
-  source = os.path.join(temp_dir, 'lucene-%s' % version) 
+  source = os.path.join(temp_dir, 'lucene-%s' % version)
   if os.path.exists(source):
     if force:
       shutil.rmtree(source)
@@ -173,7 +173,7 @@ def download_release(version, temp_dir, force):
   olddir = os.getcwd()
   os.chdir(temp_dir)
   scriptutil.run('tar -xvzf %s' % source_tgz)
-  os.chdir(olddir) 
+  os.chdir(olddir)
   print('done')
   return source
 
@@ -195,9 +195,9 @@ def read_config():
   c = parser.parse_args()
 
   return c
-  
+
 def main():
-  c = read_config() 
+  c = read_config()
   if not os.path.exists(c.temp_dir):
     os.makedirs(c.temp_dir)
 
@@ -216,7 +216,7 @@ def main():
     create_and_add_index(source, 'dvupdates', c.version, current_version, c.temp_dir)
     create_and_add_index(source, 'emptyIndex', c.version, current_version, c.temp_dir)
     print ('\nMANUAL UPDATE REQUIRED: edit TestGenerateBwcIndices to enable moreterms, dvupdates, and empty index testing')
-    
+
   print('\nAdding backwards compatibility tests')
   update_backcompat_tests(c.version, current_version)
 

diff --git a/dev-tools/scripts/addVersion.py b/dev-tools/scripts/addVersion.py
@@ -37,7 +37,7 @@ def edit(buffer, match, line):
         buffer.append('%s\n---------------------\n(No changes)\n\n' % header)
     buffer.append(line)
     return match is not None
-     
+
   changed = update_file(filename, matcher, edit)
   print('done' if changed else 'uptodate')
 
@@ -53,7 +53,7 @@ def ensure_deprecated(buffer):
     if last.strip() != '@Deprecated':
       spaces = ' ' * (len(last) - len(last.lstrip()) - 1)
       del buffer[-1] # Remove comment closer line
-      if (len(buffer) >= 4 and re.search('for Lucene.\s*$', buffer[-1]) is not None):
+      if (len(buffer) >= 4 and re.search(r'for Lucene.\s*$', buffer[-1]) is not None):
         del buffer[-3:] # drop the trailing lines '<p> / Use this to get the latest ... / ... for Lucene.'
       buffer.append(( '{0} * @deprecated ({1}) Use latest\n'
                     + '{0} */\n'
@@ -73,7 +73,7 @@ def buffer_constant(buffer, line):
       buffer.append('%s@Deprecated\n' % spaces)
     buffer.append('{0}public static final Version {1} = new Version({2}, {3}, {4});\n'.format
                   (spaces, new_version.constant, new_version.major, new_version.minor, new_version.bugfix))
-  
+
   class Edit(object):
     found = -1
     def __call__(self, buffer, match, line):
@@ -97,14 +97,14 @@ def __call__(self, buffer, match, line):
 
       buffer.append(line)
       return False
-  
+
   changed = update_file(filename, matcher, Edit())
   print('done' if changed else 'uptodate')
 
 def update_build_version(new_version):
   print('  changing baseVersion...', end='', flush=True)
   filename = 'build.gradle'
-  def edit(buffer, match, line):
+  def edit(buffer, _, line):
     if new_version.dot in line:
       return None
     buffer.append('  String baseVersion = \'' + new_version.dot + '\'\n')
@@ -118,7 +118,7 @@ def update_latest_constant(new_version):
   print('  changing Version.LATEST to %s...' % new_version.constant, end='', flush=True)
   filename = 'lucene/core/src/java/org/apache/lucene/util/Version.java'
   matcher = re.compile('public static final Version LATEST')
-  def edit(buffer, match, line):
+  def edit(buffer, _, line):
     if new_version.constant in line:
       return None
     buffer.append(line.rpartition('=')[0] + ('= %s;\n' % new_version.constant))

diff --git a/dev-tools/scripts/buildAndPushRelease.py b/dev-tools/scripts/buildAndPushRelease.py
@@ -51,6 +51,8 @@ def runAndSendGPGPassword(command, password):
   p = subprocess.Popen(command, shell=True, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
   f = open(LOG, 'ab')
   while True:
+    assert p.stdout
+    assert p.stdin
     p.stdout.flush()
     line = p.stdout.readline()
     if len(line) == 0:
@@ -176,8 +178,7 @@ def checkDOAPfiles(version):
     treeRoot = ET.parse(doapFile).getroot()
     doapRevisions = set()
     for revision in treeRoot.findall(xpathRevision):
-      match = reDoapRevision.match(revision.text)
-      if (match is not None):
+      if (revision.text and (match := reDoapRevision.match(revision.text))):
         if (match.group(1) not in ('0', '1', '2')): # Ignore 0.X, 1.X and 2.X revisions
           doapRevisions.add(normalizeVersion(match.groups()))
       else:
@@ -412,6 +413,7 @@ def main():
     print('Next run the smoker tester:')
     p = re.compile(".*/")
     m = p.match(sys.argv[0])
+    assert m
     if not c.sign:
       signed = "--not-signed"
     else:

diff --git a/dev-tools/scripts/create_line_file_docs.py b/dev-tools/scripts/create_line_file_docs.py
@@ -61,7 +61,7 @@ def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
         break
 
       bytes_in_chunk += len(line)
-      f_out.write(line)
+      f_out.write(line) # false positive in python's crazy typing # pyright: ignore[reportArgumentType]
 
       if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
         f_out.close()
@@ -72,12 +72,12 @@ def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
     for seek_point in seek_points:
       f_out.write('%d\n' % seek_point)
 
-re_tag = re.compile('<[^>]+?>')
-re_newlines = re.compile('\n+')
-re_space = re.compile('\s')
+re_tag = re.compile(r'<[^>]+?>')
+re_newlines = re.compile(r'\n+')
+re_space = re.compile(r'\s')
 
 # used to find word break, for splitting docs into ~1 KB sized smaller docs:
-re_next_non_word_character = re.compile('\W', re.U)
+re_next_non_word_character = re.compile(r'\W', re.U)
 
 EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'
 
@@ -101,7 +101,7 @@ def split_docs(all_out, title_string, date_string, body_string):
       char_count = len(body_string)
 
     body_string_fragment = body_string[:char_count].strip()
-    
+
     #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
     all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
     body_string = body_string[char_count:]
@@ -143,7 +143,7 @@ def sample_europarl():
     next_print_time = start_time + 3
     # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
     with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
-      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
+      for dir_path, _, file_names in os.walk('%s/txt' % tmp_dir_path):
         for file_name in file_names:
           if file_name.endswith('.txt'):
             file_count += 1
@@ -155,7 +155,7 @@ def sample_europarl():
               year = 2000 + year
 
             date_string = '%04d-%02d-%02d' % (year, month, day)
-            
+
             # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
             chapter_count = 0
             with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
@@ -176,7 +176,7 @@ def sample_europarl():
                       doc_count += split_docs(all_out, last_title, date_string, s)
                     else:
                       skip_count += 1
-                      
+
                     last_text = []
                     chapter_count += 1
                   while True:
@@ -248,7 +248,7 @@ def sample_europarl():
       compress_with_seek_points(file_name_out,
                                 file_name_out + '.gz',
                                 mb)
-            
+
   finally:
     print('Removing tmp dir "%s"...' % tmp_dir_path)
     if not DEBUG:

diff --git a/dev-tools/scripts/diff_lucene_changes.py b/dev-tools/scripts/diff_lucene_changes.py
@@ -42,8 +42,10 @@ def get_changes_url(branch_name):
   return url
 
 def extract_release_section(changes_txt, release_name):
-  return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
-                   changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')
+  match = re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
+                   changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL)
+  assert match
+  return match.group(1).encode('utf-8')
 
 def main():
   if len(sys.argv) < 3 or len(sys.argv) > 5:

diff --git a/dev-tools/scripts/githubPRs.py b/dev-tools/scripts/githubPRs.py
@@ -26,8 +26,10 @@
 import argparse
 import json
 import re
+from typing import cast
 from github import Github
-from jira import JIRA
+from jira import JIRA, Issue
+from jira.client import ResultList
 from jinja2 import Environment, BaseLoader
 
 def read_config():
@@ -46,7 +48,7 @@ def out(text):
 
 def make_html(dict):
   global conf
-  template = Environment(loader=BaseLoader).from_string("""
+  template = Environment(loader=BaseLoader()).from_string("""
   <h1>Lucene Github PR report</h1>
 
   <p>Number of open Pull Requests: {{ open_count }}</p>
@@ -75,7 +77,7 @@ def main():
     gh = Github(token)
   else:
     gh = Github()
-  jira = JIRA('https://issues.apache.org/jira')
+  jira = JIRA('https://issues.apache.org/jira') # this ctor has broken types in jira library. # pyright: ignore[reportArgumentType]
   result = {}
   repo = gh.get_repo('apache/lucene')
   open_prs = repo.get_pulls(state='open')
@@ -100,19 +102,22 @@ def main():
   issue_ids = []
   issue_to_pr = {}
   for pr in has_jira:
-    jira_issue_str = re.match(r'.*\b((LUCENE)-\d{3,6})\b', pr.title).group(1)
+    match = re.match(r'.*\b((LUCENE)-\d{3,6})\b', pr.title)
+    assert match
+    jira_issue_str = match.group(1)
     issue_ids.append(jira_issue_str)
     issue_to_pr[jira_issue_str] = pr
 
-  resolved_jiras = jira.search_issues(jql_str="key in (%s) AND status in ('Closed', 'Resolved')" % ", ".join(issue_ids))
+  resolved_jiras = cast(ResultList[Issue], jira.search_issues(jql_str="key in (%s) AND status in ('Closed', 'Resolved')" % ", ".join(issue_ids)))
   closed_jiras = []
   for issue in resolved_jiras:
     pr_title = issue_to_pr[issue.key].title
     pr_number = issue_to_pr[issue.key].number
     assignee = issue.fields.assignee.name if issue.fields.assignee else None
+    resolution = issue.fields.resolution.name if issue.fields.resolution else None
     closed_jiras.append({ 'issue_key': issue.key,
                            'status': issue.fields.status.name,
-                           'resolution': issue.fields.resolution.name,
+                           'resolution': resolution,
                            'resolution_date': issue.fields.resolutiondate[:10],
                            'pr_number': pr_number,
                            'pr_title': pr_title,

diff --git a/dev-tools/scripts/pyproject.toml b/dev-tools/scripts/pyproject.toml
@@ -4,16 +4,7 @@ venv = ".venv"
 # TODO: improve!
 # typeCheckingMode = "strict"
 reportUnnecessaryTypeIgnoreComment = "error"
-typeCheckingMode = "basic"
-# TODO: we should fix these
-reportArgumentType = "none"
-reportAttributeAccessIssue = "none"
-reportCallIssue = "none"
-reportInvalidStringEscapeSequence = "none"
-reportOperatorIssue = "none"
-reportOptionalIterable = "none"
-reportOptionalMemberAccess = "none"
-reportOptionalSubscript = "none"
+typeCheckingMode = "standard"
 
 [tool.ruff]
 line-length = 200